Introduction

Assess impacts of motifs across ATAC-seq collected over changing concentrations of Oct4 (Xiong et al (2022)).

Computational Setup

#Standard packages
library(rtracklayer) ; library(GenomicRanges); library(magrittr) ; library(Biostrings)
library(ggplot2) ; library(reshape2); library(plyranges); library(Rsamtools); library(parallel)
library(dplyr); library(data.table); library(patchwork); library(readr); library(testit)

#KNITR Options
setwd("/n/projects/mw2098/publications/2024_weilert_acc/code/2_analysis/")
figure_filepath<-"figures/16_concentration"
options(knitr.figure_dir=figure_filepath, java.parameters = "- Xmx6g")

#Lab sources
source("scripts/r/granges_common.r")
source("scripts/r/metapeak_common.r")
source("scripts/r/knitr_common.r")
source("scripts/r/caching.r")
source("scripts/r/metapeak_functions.r")

#Specific sources
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(ggseqlogo)
source("scripts/r/motif_functions.r")
library(DESeq2)

#Pre-existing variables
bsgenome<-BSgenome.Mmusculus.UCSC.mm10
txdb<-TxDb.Mmusculus.UCSC.mm10.knownGene
regions.path<-'bed/mapped_motifs/all_xiong_islands_curated_0based_sized_to_output.bed'
motifs.path<-'tsv/mapped_motifs/all_xiong_instances_curated_0based_w_perturb.tsv.gz'
supplementary_motifs.path<-'modiscolite/atac_wt_fold1_atac_counts/hits.tsv'

motif_to_task.list<-list('Oct4-Sox2' = 'oct4', 'Oct4' = 'oct4', 'Sox2' = 'sox2', 'Klf4' = 'klf4')
motif_to_task.df<-data.frame(task = motif_to_task.list %>% unlist, motif = names(motif_to_task.list))

models_of_interest<-c('atac_0h', 'atac_3h', 'atac_6h', 'atac_9h', 'atac_12h', 'atac_15h')
models.list<-list(atac_0h = c('atac'),
                  atac_3h = c('atac'),
                  atac_6h = c('atac'),
                  atac_9h = c('atac'),
                  atac_12h = c('atac'),
                  atac_15h = c('atac'))

os_patterns.list<-list(
  `Oct4-Sox2` = list(
    atac_0h = c('pattern_2'),
    atac_3h = c('pattern_3', 'pattern_10', 'pattern_15'),
    atac_6h = c('pattern_3', 'pattern_8', 'pattern_15'),
    atac_9h = c('pattern_11'),
    atac_12h = c('pattern_20'),
    atac_15h = c(NA)
  ),
  Sox2 = list(
    atac_0h = c('pattern_1'),
    atac_3h = c('pattern_2', 'pattern_4'),
    atac_6h = c('pattern_1'),
    atac_9h = c('pattern_0'),
    atac_12h = c('pattern_1', 'pattern_2'),
    atac_15h = c('pattern_1', 'pattern_3')
  )
)

os_orientation.list<-list(
  `Oct4-Sox2` = list(
    atac_0h = c('-'),
    atac_3h = c('-'),
    atac_6h = c('+'),
    atac_9h = c('-'),
    atac_12h = c('-'),
    atac_15h = c(NA)
  ),
  Sox2 = list(
    atac_0h = c('+'),
    atac_3h = c('+'),
    atac_6h = c('+'),
    atac_9h = c('+'),
    atac_12h = c('+'),
    atac_15h = c('+')
  )
)

distance_class_breaks<-c(0, 70, 180, 500, 1000)
distance_class_labels<-c('protein-range', 'nucleosome-range', 'enhancer-range', 'long-range')
marginalized_affinities.df<-readr::read_tsv('tsv/insilico/marginalizations/motifs/all_xiong_marginalizations.tsv.gz')
count_threshold<-20

acc_motif_to_task.list<-list('Oct4-Sox2' = 'atac', 'Sox2' = 'atac', 'Klf4' = 'atac', 'Ctcf' = 'atac')
acc_motif_to_task.df<-data.frame(key_task = acc_motif_to_task.list %>% unlist, raw_motifA = names(acc_motif_to_task.list))

Mark concentrations

treatments<-paste0(seq(0, 15, 3), 'h')
conc.df<-data.frame(treatment = treatments, 
                    Oct4 = c(1, .875, .505, .17, .07, .025),
                    Sox2 = c(1, 1.268, 1.126, .324, .164, .017)) %>%
    dplyr::mutate(treatment_numeric = gsub('h', '', treatment) %>% as.numeric(),
                  Sox2_alpha_norm = Sox2/max(Sox2))

Import motifs, perturbation scores, and marginalization scores

Import motifs and interpretation scores.

Read in 0h motifs and curate

motifs.gr<-readr::read_tsv(motifs.path) %>% 
  dplyr::group_by(motif) %>%
  dplyr::mutate(seq_match_quantile = ecdf(seq_match)(seq_match),
                seq_match_quantile_bin = cut(seq_match_quantile, breaks = c(0, .33, .67, 1), include.lowest = T, labels = paste0('smq_', 1:3)),
                region_id = as.integer(region_id)) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T, 
                                                                     seqnames.field = 'seqnames', start.field = 'start', end.field = 'end', 
                                                                     strand.field = 'strand')

Read in islands and format

#Read in islands
regions.gr<-rtracklayer::import(regions.path) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = F, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end') %>%
  plyranges::mutate(atac_obs = regionSums(., 'bw/GSE174774_mesc_atac_0h_cutsites_combined.bw'),
                    atac_obs_q = ecdf(atac_obs)(atac_obs),
                    atac_obs_q_bin = plyr::round_any(atac_obs_q, accuracy = .2, f = ceiling),
                    region_id = as.integer(name),
                    island_seq = getSeq(bsgenome, GRanges(seqnames, IRanges(start, end)), as.character = T),
                    C = stringr::str_count(island_seq, 'C'),
                    G = stringr::str_count(island_seq, 'G'),
                    CpG = stringr::str_count(island_seq, 'CG'),
                    CpG_ratio = round((CpG) / ((((C + G) / 2) ^ 2) / width), 2))

Process supplemental motifs

Add in Ctcf motifs for a control comparison:

ctcf_motifs.gr<-readr::read_tsv(supplementary_motifs.path) %>% 
  dplyr::filter(pattern_name == 'pattern_4', metacluster_name == 'pos_patterns') %>%
  dplyr::mutate(motif='Ctcf') %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T)

motifs.gr<-c(motifs.gr, ctcf_motifs.gr)

Collect supplementary motifs that contained TF-MoDISco seqlets >200 or were of high-confidence identity. We will use these to delineate “grouped” and “isolated” classifications of Oct4-Sox2 or Sox2.

# other_motifs_all.gr<-readr::read_tsv(supplementary_motifs.path) %>% 
#   dplyr::filter(pattern_name %in% paste0('pattern_', c(3, 4, 6, 7, 8, 10, 11, 12, 13, 14)), metacluster_name == 'pos_patterns') %>%
#   # dplyr::mutate(motif='Ctcf') %>%
#   makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T) %>%
#   plyranges::mutate(motif = short_name) %>% 
#   plyranges::filter(!overlapsAny(., motifs.gr, ignore.strand = T))
# 
# ov<-findOverlaps(other_motifs_all.gr, regions.gr, ignore.strand = T)
# other_motifs.gr<-other_motifs_all.gr[ov@from]
# other_motifs.gr$region_id<-regions.gr$region_id[ov@to]
# 
# other_motifs_filt.gr<-lapply(other_motifs.gr$pattern_name %>% unique, function(x){
#   motifs_dedup.gr<-remove_palindromic_motifs_from_bpnet_instances(other_motifs.gr,
#                                                                 pattern_to_filter = x, 
#                                                                 chrom_name = 'seqnames', 
#                                                                 start_name = 'start', 
#                                                                 end_name = 'end', 
#                                                                 value_name = 'contrib_match', 
#                                                                 motif_name = 'pattern_name',
#                                                                 region_name = 'region_id')
#   return(motifs_dedup.gr)
# }) %>% as('GRangesList') %>% unlist()

Find expanded islands

Find expanded islands that consider all contributing motifs to accessibility.

island_classes.df<-motifs.gr %>% plyranges::filter(motif!='Ctcf') %>% #c(motifs.gr, other_motifs.gr) %>% #Include other motifs
  GenomicRanges::sort(ignore.strand = T) %>%
  as.data.frame %>%
  dplyr::group_by(region_id) %>%
  dplyr::summarize(island_content = paste(unique(sort(motif)), table(sort(motif)), sep = ':', collapse = '_'),
                   island_content_ordered = paste(motif, collapse = '_'),
                   island_content_unique = paste(unique(sort(motif)), collapse = '_'),
                   island_count = length(motif))

regions.gr<-regions.gr %>% as.data.frame %>% dplyr::left_join(., island_classes.df) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = F)
motifs.gr<-motifs.gr %>% as.data.frame %>% dplyr::left_join(., island_classes.df) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = F, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end')

#Reduce islands in order to accommodate enhancers that are grouped together.
reduced_regions.gr<-regions.gr %>% GenomicRanges::reduce(ignore.strand = T)
ov<-findOverlaps(regions.gr, reduced_regions.gr, ignore.strand = T)
regions.gr$reduced_region_id<-ov@to

Collect data

Define predicted .bw files

pred.bw.list<-list(`0h` = 'preds/atac_0h_fold1_atac_atac_peaks_all.bw',
                   `3h` = 'preds/atac_3h_fold1_atac_atac_peaks_all.bw',
                   `6h` = 'preds/atac_6h_fold1_atac_atac_peaks_all.bw',
                   `9h` = 'preds/atac_9h_fold1_atac_atac_peaks_all.bw',
                   `12h` = 'preds/atac_12h_fold1_atac_atac_peaks_all.bw',
                   `15h` = 'preds/atac_15h_fold1_atac_atac_peaks_all.bw')

shap.bw.list<-list(`0h` = 'shap/atac_0h_fold1_atac_counts.bw',
                   `3h` = 'shap/atac_3h_fold1_atac_counts.bw',
                   `6h` = 'shap/atac_6h_fold1_atac_counts.bw',
                   `9h` = 'shap/atac_9h_fold1_atac_counts.bw',
                   `12h` = 'shap/atac_12h_fold1_atac_counts.bw',
                   `15h` = 'shap/atac_15h_fold1_atac_counts.bw')

norm.atac.bw.list<-list(`0h` = 'bw/GSE174774_mesc_atac_0h_combined_normalized.bw',
                        `3h` = 'bw/GSE174774_mesc_atac_3h_combined_normalized.bw',
                        `6h` = 'bw/GSE174774_mesc_atac_6h_combined_normalized.bw',
                        `9h` = 'bw/GSE174774_mesc_atac_9h_combined_normalized.bw',
                        `12h` = 'bw/GSE174774_mesc_atac_12h_combined_normalized.bw',
                        `15h` = 'bw/GSE174774_mesc_atac_15h_combined_normalized.bw')

Collect SHAP scores across motifs

shap.df<-mclapply(names(shap.bw.list), function(x){
  regionSums(motifs.gr, shap.bw.list[[x]])
}, mc.cores = 2) %>% as.data.frame()
names(shap.df)<-names(shap.bw.list)
motifs.df<-cbind(motifs.gr %>% as.data.frame, shap.df)

Collect normalized accessibility across regions

norm.atac.df<-mclapply(names(norm.atac.bw.list), function(x){
  regionSums(regions.gr, 
             norm.atac.bw.list[[x]])
}, mc.cores = 2) %>% as.data.frame()
names(norm.atac.df)<-names(norm.atac.bw.list)
regions_w_atac.df<-cbind(regions.gr %>% as.data.frame, norm.atac.df)

Collect predicted accessibility across regions

pred.atac.df<-mclapply(names(pred.bw.list), function(x){
  regionSums(regions.gr, 
             pred.bw.list[[x]])
}, mc.cores = 2) %>% as.data.frame()
names(pred.atac.df)<-names(pred.bw.list)
regions_w_pred_atac.df<-cbind(regions.gr %>% as.data.frame, pred.atac.df)

Collect marginalization scores

marg_all.df<-lapply(models_of_interest, function(x){
  new_col<-paste0('marg_', x)
  
  marg.df<-Sys.glob(paste0('tsv/insilico/marginalizations/motifs/xiong_marginalizations_*', x, '*.tsv.gz')) %>% readr::read_tsv() 
  null_val<-marg.df %>% dplyr::filter(state=='null') %>% .$atac
  print(null_val)
  marg.df[[new_col]]<-marg.df$atac - null_val
  
  return(marg.df[[new_col]])
}) %>% as.data.frame()
## [1] 9.420504
## [1] 9.647603
## [1] 9.824745
## [1] 9.599579
## [1] 9.707907
## [1] 9.471004
colnames(marg_all.df)<-paste0('marg_', seq(0, 15, 3), 'h')

marg_all.df<-cbind(Sys.glob(paste0('tsv/insilico/marginalizations/motifs/xiong_marginalizations_atac_0h.tsv.gz')) %>% readr::read_tsv() %>% dplyr::rename(seq = seqA) %>% dplyr::select(seq), marg_all.df)
motifs_w_marg.df<-motifs.df %>% dplyr::left_join(marg_all.df)

Systematically test interpretations across time course

Given contribution scores, isolation scores, context scores, and cooperativity scores (context-isolation), check how they change over Oct4 concentration depletion time course.

motifs_w_perturb.df<-motifs_w_marg.df %>%
  dplyr::rename(`contrib_0h` = `0h`, `contrib_3h` = `3h`, `contrib_6h` = `6h`, `contrib_9h` = `9h`, `contrib_12h` = `12h`, `contrib_15h` = `15h`) %>%
  dplyr::mutate(`perturb_0h` = log(`atac_0h.atac.WT.pred_sum`/`atac_0h.atac.dA.pred_sum`),
                `perturb_3h` = log(`atac_3h.atac.WT.pred_sum`/`atac_3h.atac.dA.pred_sum`),
                `perturb_6h` = log(`atac_6h.atac.WT.pred_sum`/`atac_6h.atac.dA.pred_sum`),
                `perturb_9h` = log(`atac_9h.atac.WT.pred_sum`/`atac_9h.atac.dA.pred_sum`),
                `perturb_12h` = log(`atac_12h.atac.WT.pred_sum`/`atac_12h.atac.dA.pred_sum`),
                `perturb_15h` = log(`atac_15h.atac.WT.pred_sum`/`atac_15h.atac.dA.pred_sum`),
                `coopscore_0h` = perturb_0h - marg_0h,
                `coopscore_3h` = perturb_3h - marg_3h,
                `coopscore_6h` = perturb_6h - marg_6h,
                `coopscore_9h` = perturb_9h - marg_9h,
                `coopscore_12h` = perturb_12h - marg_12h,
                `coopscore_15h` = perturb_15h - marg_15h)

Summarize motifs based on their interpretation scores for plotting.

motifs_by_interp_ungrouped.df<- motifs_w_perturb.df %>%
  dplyr::select(motif, seq_match_quantile_bin, island_count, island_content,
                contrib_0h, contrib_3h, contrib_6h, contrib_9h, contrib_12h, contrib_15h,
                marg_0h, marg_3h, marg_6h, marg_9h, marg_12h, marg_15h,
                perturb_0h, perturb_3h, perturb_6h, perturb_9h, perturb_12h, perturb_15h,
                coopscore_0h, coopscore_3h, coopscore_6h, coopscore_9h, coopscore_12h, coopscore_15h) %>%
  tidyr::pivot_longer(cols = c(-motif, -seq_match_quantile_bin, -island_count, -island_content), names_to = 'interpretation', values_to = 'value') %>%
  tidyr::separate(col = 'interpretation', into = c('interp_type', 'timepoint'), sep = '_') %>%
  dplyr::mutate(timepoint_num = gsub('h', '', timepoint) %>% as.integer())

#Mark whether they're grouped or not.
motifs_by_interp.df<-lapply(c('Sox2','Oct4-Sox2'), function(x){
  #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    grouped.df<-motifs_by_interp_ungrouped.df %>% dplyr::filter(motif==x) %>% 
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    grouped.df<-motifs_by_interp_ungrouped.df %>% dplyr::filter(motif==x) %>% 
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2', island_content))
  }
  return(grouped.df)
  
}) %>% rbindlist()

Summarize by PWM quantile to see the relative effects over time.

motifs_by_interp_summary.df<-motifs_by_interp.df %>% 
    dplyr::group_by(motif, seq_match_quantile_bin, interp_type, timepoint_num, timepoint, island_state) %>%
    dplyr::summarize(med_val = median(value, na.rm = T)) %>%
    dplyr::mutate(interp_type = factor(interp_type, levels = c('contrib','perturb','marg','coopscore'), labels = c('SHAP', 'Context','Isolation','Coop (Context - isolation)')))

abs_interp.plot<-ggplot(motifs_by_interp_summary.df, aes(x = timepoint_num, y = med_val, color = seq_match_quantile_bin, linetype = island_state))+
  geom_hline(yintercept = 0, linetype = 'dashed')+
  geom_point() + geom_line()+
  facet_grid(interp_type ~ motif, scales = 'free') + 
  scale_y_continuous(name = 'Median interpretation value (log-scale)')+
  ggtitle('Absolute loss of interpretation values over time course')+
  theme_classic()+ theme(panel.border = element_rect(fill = NA))
abs_interp.plot

ggsave(paste0(figure_filepath, '/time_course_absolute_loss_of_interp_values.png'), abs_interp.plot, height = 11, width = 8)
ggsave(paste0(figure_filepath, '/time_course_absolute_loss_of_interp_values.pdf'), abs_interp.plot, height = 11, width = 8)

Reimpose given relative loss with reference loss at 0h.

motifs_by_interp_summary_relative.df<-motifs_by_interp_summary.df %>%
  dplyr::ungroup() %>%
  dplyr::left_join(motifs_by_interp_summary.df %>% dplyr::ungroup() %>% 
                     dplyr::filter(timepoint_num == 0, seq_match_quantile_bin=='smq_3') %>% 
                     dplyr::select(-timepoint_num, -timepoint, -seq_match_quantile_bin) %>%
                     dplyr::rename(med_null_val = med_val)) %>%
  dplyr::mutate(med_relative_value = med_val / med_null_val)

rel_interp_0h_q5.plot<-ggplot(motifs_by_interp_summary_relative.df, aes(x = timepoint_num, y = med_relative_value, color = seq_match_quantile_bin, linetype = island_state))+
  geom_hline(yintercept = 0, linetype = 'dashed')+
  geom_point() + geom_line()+
  facet_grid(interp_type ~ motif, scales = 'free') + 
  scale_y_continuous(name = 'Median interpretation value (log-scale)')+
  ggtitle('Relative loss of interpretation values over time course\nAnchored on 0h, q5')+
  theme_classic()+ theme(panel.border = element_rect(fill = NA))

motifs_by_interp_summary_relative.df<-motifs_by_interp_summary.df %>%
  dplyr::ungroup() %>%
  dplyr::left_join(motifs_by_interp_summary.df %>% dplyr::ungroup() %>% 
                     dplyr::filter(timepoint_num == 0) %>% 
                     dplyr::select(-timepoint_num, -timepoint) %>%
                     dplyr::rename(med_null_val = med_val)) %>%
  dplyr::mutate(med_relative_value = med_val / med_null_val)

rel_interp_0h.plot<-ggplot(motifs_by_interp_summary_relative.df, aes(x = timepoint_num, y = med_relative_value, color = seq_match_quantile_bin, linetype = island_state))+
  geom_hline(yintercept = 0, linetype = 'dashed')+
  geom_point() + geom_line()+
  facet_grid(interp_type ~ motif, scales = 'free') + 
  scale_y_continuous(name = 'Median interpretation value (log-scale)')+
  ggtitle('Relative loss of interpretation values over time course\nAnchored on 0h only')+
  theme_classic()+ theme(panel.border = element_rect(fill = NA))


rel_interp.plot<-rel_interp_0h_q5.plot + rel_interp_0h.plot
rel_interp.plot

ggsave(paste0(figure_filepath, '/time_course_relative_loss_of_interp_values.png'), rel_interp.plot, height = 11, width = 16)
ggsave(paste0(figure_filepath, '/time_course_relative_loss_of_interp_values.pdf'), rel_interp.plot, height = 11, width = 16)

Reimpose given slope of changes from transitions.

motifs_by_interp_summary_slope.df<-motifs_by_interp_summary.df %>%
  dplyr::group_by(motif, seq_match_quantile_bin, interp_type, island_state) %>%
  dplyr::arrange(motif, seq_match_quantile_bin, interp_type, timepoint_num) %>%
  dplyr::mutate(med_val_diff = c(NA, diff(med_val))) %>%
  dplyr::filter(!is.na(med_val_diff))

slope_interp.plot<-ggplot(motifs_by_interp_summary_slope.df, aes(x = timepoint_num, y = med_val_diff, color = seq_match_quantile_bin, linetype = island_state))+
  geom_hline(yintercept = 0, linetype = 'dashed')+
  geom_point() + geom_line()+
  facet_grid(interp_type ~ motif, scales = 'free') + 
  scale_y_continuous(name = 'Median interpretation value (log-scale)')+
  scale_x_continuous(name = 'Transition INTO timepoint\n(i.e. 3 = 0h -> 3h')+
  ggtitle('Slope of interpretation values over time course')+
  theme_classic()+ theme(panel.border = element_rect(fill = NA))
slope_interp.plot

ggsave(paste0(figure_filepath, '/time_course_slope_loss_of_interp_values.png'), slope_interp.plot, height = 11, width = 8)
ggsave(paste0(figure_filepath, '/time_course_slope_loss_of_interp_values.pdf'), slope_interp.plot, height = 11, width = 8)

Run DESeq2

Collect DESeq2 normalized accessibility across regions

#Collect data
atac.bw.vec<-Sys.glob('../1_processing/bw/GSE174774_mesc_atac_*_*_cutsites.bw')
atac.df<-mclapply(atac.bw.vec, function(x){
  regionSums(regions.gr, 
             x)
}, mc.cores = 2) %>% as.data.frame()

#Format organization
atac_labels.df<-data.frame(filename = atac.bw.vec) %>%
  dplyr::mutate(state = basename(filename) %>% gsub('GSE174774_mesc_atac_', '', .) %>% gsub('_cutsites.bw', '', .)) %>%
  tidyr::separate(state, into = c('timepoint','replicate'), remove = F)

names(atac.df)<-atac_labels.df$state

#Run DESeq2
dds <- DESeqDataSetFromMatrix(countData = atac.df,
                              colData = atac_labels.df,
                              design = ~ timepoint)
dds <- DESeq(dds)

#Return enrichment results
log2FoldChange_atac.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint",x, "0h"))
  res$log2FoldChange
}) %>% as.data.frame()
colnames(log2FoldChange_atac.df)<-paste0('deseq_', seq(3, 15, 3), 'h_vs_0h')
head(log2FoldChange_atac.df)
##   deseq_3h_vs_0h deseq_6h_vs_0h deseq_9h_vs_0h deseq_12h_vs_0h deseq_15h_vs_0h
## 1    -0.37090856   -0.153156033    0.102551357     -0.20443054    -0.108310560
## 2     0.24510469    0.273182746    0.001033166     -0.05862938     0.104027723
## 3    -0.40436795    0.227640844   -0.654836238     -0.60574621    -0.503051108
## 4     0.02648144   -0.004303579   -0.100683824      0.09601912    -0.002666202
## 5    -0.07957264   -0.097010013   -0.168614982      0.01571568    -0.088992381
## 6     0.10098966    0.072704187    0.143388257      0.13513098     0.468101161
#Return whether the region is significantly up- or down-regulated
padj_atac.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint",x, "0h"))
  res$padj
}) %>% as.data.frame()
colnames(padj_atac.df)<-paste0('padj_', seq(3, 15, 3), 'h_vs_0h')
head(padj_atac.df)
##   padj_3h_vs_0h padj_6h_vs_0h padj_9h_vs_0h padj_12h_vs_0h padj_15h_vs_0h
## 1            NA            NA            NA             NA             NA
## 2            NA     0.3072666     0.9982259      0.8538062      0.7082605
## 3            NA            NA            NA             NA             NA
## 4            NA            NA            NA             NA             NA
## 5            NA            NA            NA             NA             NA
## 6            NA            NA            NA             NA             NA
regions_w_deseq.df<-cbind(regions.gr %>% as.data.frame, log2FoldChange_atac.df, padj_atac.df)

Collect DESeq2 normalized transcription across enhancers

#Collect data
erna.bw.vec<-Sys.glob('../1_processing/bw/GSE174774_mesc_ttseq_*_*.bw')
erna.df<-mclapply(erna.bw.vec, function(x){
  regionSums(regions.gr,
             x)
}, mc.cores = 2) %>% as.data.frame()

#Format organization
erna_labels.df<-data.frame(filename = erna.bw.vec) %>%
  dplyr::mutate(state = basename(filename) %>% gsub('GSE174774_mesc_ttseq_', '', .) %>% gsub('.bw', '', .)) %>%
  tidyr::separate(state, into = c('timepoint','replicate'), remove = F)
names(erna.df)<-erna_labels.df$state

#Run DESeq2
dds <- DESeqDataSetFromMatrix(countData = erna.df,
                              colData = erna_labels.df,
                              design = ~ timepoint)
dds <- DESeq(dds)

#Return enrichment results
log2FoldChange_erna.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint",x, "0h"))
  res$log2FoldChange
}) %>% as.data.frame()
colnames(log2FoldChange_erna.df)<-paste0('ttseq_', seq(3, 15, 3), 'h_vs_0h')

#Return whether the region is significantly up- or down-regulated
padj_erna.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint",x, "0h"))
  res$padj
}) %>% as.data.frame()
colnames(padj_erna.df)<-paste0('ttseq_padj_', seq(3, 15, 3), 'h_vs_0h')
head(padj_erna.df)
##   ttseq_padj_3h_vs_0h ttseq_padj_6h_vs_0h ttseq_padj_9h_vs_0h
## 1                  NA                  NA                  NA
## 2                   1           1.0000000          0.80728137
## 3                   1           0.2883047          0.04883932
## 4                   1           1.0000000          1.00000000
## 5                   1           1.0000000          1.00000000
## 6                   1           1.0000000          0.34543639
##   ttseq_padj_12h_vs_0h ttseq_padj_15h_vs_0h
## 1                   NA                   NA
## 2         0.7456910796         9.230833e-01
## 3         0.0000324912         9.705056e-10
## 4         0.8664747307         7.875962e-01
## 5         0.6129955366         7.716752e-01
## 6         0.6204651281         1.000000e+00
regions_w_deseq_both.df<-cbind(regions_w_deseq.df %>% as.data.frame, log2FoldChange_erna.df, padj_erna.df)
readr::write_tsv(regions_w_deseq_both.df, 'tsv/mapped_motifs/all_islands_curated_1based_w_deseq2_data.tsv.gz')

Collect DESeq2 normalized transcription across genes

Given published TT-seq data from Xiong et al (2022), assess pairwise differential expression across annotated regions.

#Collect data
ttseq.counts.path<-Sys.glob('../1_processing/bam/GSE174774_mesc_ttseq_*ReadsPerGene.out.tab')
ttseq_conditions.vec<-ttseq.counts.path %>% basename %>% gsub('GSE174774_mesc_ttseq_', '', .) %>% gsub('ReadsPerGene.out.tab', '', .)
ttseq_annotations.gr<-rtracklayer::import('../../public/databases/Ens/mm10.Ens_98.gtf')

#Import counts tables
ttseq_names.vec<-readr::read_tsv(ttseq.counts.path[1], F) %>% .[[1]]
ttseq.df<-mclapply(ttseq.counts.path, function(x){
  df<-readr::read_tsv(x, F)
  colnames(df)<-c('gene_name','reads','fwd','rev')
  df$reads
}, mc.cores = 2) %>% as.data.frame()
colnames(ttseq.df)<-ttseq_conditions.vec
ttseq.df$gene_id<-ttseq_names.vec
ttseq.df<-ttseq.df %>% dplyr::filter(!grepl('N_', gene_id))

#Filter out genes that have no reads across them
genes_to_keep.vec<-ttseq.df %>% tidyr::pivot_longer(cols = ttseq_conditions.vec, names_to = 'condition', values_to = 'counts') %>%
  dplyr::group_by(gene_id) %>% dplyr::summarize(total_counts = sum(counts)) %>%
  dplyr::filter(total_counts>0) %>%
  .$gene_id %>% unique()

ttseq.df<-ttseq.df %>% dplyr::filter(gene_id %in% genes_to_keep.vec)
  
#Attach annotations
ttseq_w_ann.df<-ttseq.df %>%
  dplyr::left_join(., ttseq_annotations.gr %>% as.data.frame %>% dplyr::select(gene_id, gene_name, gene_source)) %>%
  unique()
testit::assert(nrow(ttseq_w_ann.df)==nrow(ttseq.df))

#Format organization
ttseq_labels.df<-data.frame(filename = ttseq.counts.path) %>%
  dplyr::mutate(state = basename(filename) %>% basename %>% gsub('GSE174774_mesc_ttseq_', '', .) %>% gsub('ReadsPerGene.out.tab', '', .)) %>%
  tidyr::separate(state, into = c('timepoint','replicate'), remove = F)

#Run DESeq2
dds <- DESeqDataSetFromMatrix(countData = ttseq.df %>% dplyr::select(-gene_id),
                              colData = ttseq_labels.df,
                              design = ~ timepoint)
dds <- DESeq(dds)

#Return enrichment results
log2FoldChange_ttseq.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint", x, "0h"))
  res$log2FoldChange
}) %>% as.data.frame()
colnames(log2FoldChange_ttseq.df)<-paste0('deseq_', seq(3, 15, 3), 'h_vs_0h')

#Return significance results
signif_ttseq.df<-lapply(paste0(seq(3, 15, 3), 'h'), function(x){
  res <- results(dds, contrast=c("timepoint", x, "0h"))
  res$padj
}) %>% as.data.frame()
colnames(signif_ttseq.df)<-paste0('signif_', seq(3, 15, 3), 'h_vs_0h')

Connect differential TT-seq values with annotations. Then connect gene locations with the nearest motif groups.

ttseq_w_deseq.df<-cbind(ttseq_w_ann.df, log2FoldChange_ttseq.df) %>%
  dplyr::left_join(., ttseq_annotations.gr %>% as.data.frame %>% dplyr::filter(type=='gene') %>% dplyr::select(gene_id, gene_name, gene_source, seqnames, start, end, strand))
readr::write_tsv(ttseq_w_deseq.df, 'tsv/mapped_motifs/all_genes_curated_1based_w_deseq2_data.tsv.gz')

#Collect significance
ttseq_w_signif.df<-cbind(ttseq_w_ann.df, signif_ttseq.df, log2FoldChange_ttseq.df) %>%
  dplyr::left_join(., ttseq_annotations.gr %>% as.data.frame %>% dplyr::filter(type=='gene') %>% dplyr::select(gene_id, gene_name, gene_source, seqnames, start, end, strand))

Read in differential data

ttseq_w_deseq.df<-readr::read_tsv('tsv/mapped_motifs/all_genes_curated_1based_w_deseq2_data.tsv.gz')
regions_w_deseq_both.df<-readr::read_tsv('tsv/mapped_motifs/all_islands_curated_1based_w_deseq2_data.tsv.gz')

Correlate SHAP with change in region

Calculate change in contribution with change in accessibility.

contrib_vs_acc.vec<-mclapply(1:nrow(shap.df), function(x){
  cor(shap.df[x,] %>% as.numeric(), norm.atac.df[motifs.df[x,]$region_id,] %>% as.numeric())
}, mc.cores = 8) %>% unlist()
motifs.df$cor_contrib_vs_acc<-contrib_vs_acc.vec

Plot SHAP scores of different concentrations across mapped motifs

cwms.plot.list<-lapply(names(acc_motif_to_task.list), function(x){
  message(x)
  logos.plot.list<-mclapply(names(shap.bw.list), function(y){
    message(y)
      gr<-motifs.gr %>% plyranges::filter(motif==x)
      seqs<-gr %>% getSeq(bsgenome, ., as.character = T)
      contrib.mat<-standard_metapeak_matrix(regions.gr = gr,
                                            sample.cov = shap.bw.list[[y]],
                                            keep_region_coordinates = T)
      seqs_1he.list<-lapply(seqs, function(z) one_hot_encode_DNA(z))
      contribs.list<-lapply(1:dim(contrib.mat)[1], function(z){
        seqs_1he.list[[z]] * matrix(rep(contrib.mat[z,], 4), nrow = 4, byrow = T)
      })
      cwm<-Reduce("+", contribs.list) / length(contribs.list)
      
      cwm.plot<-ggseqlogo(cwm, method = 'custom', seq_type='dna')+
        scale_x_continuous(name = 'Motif position (bp)')+
        scale_y_continuous(name = 'Contrib', limits = c(-.005, .025))+
        ggtitle(paste0(x, '_', y)) + 
        theme(axis.line.y = element_line(), axis.ticks.y = element_line(),
              axis.title = element_blank(), axis.text.x = element_blank())
    return(cwm.plot)
  }, mc.cores = 2)
  
  logos.plot<-wrap_plots(logos.plot.list) + plot_layout(ncol = 1)
  ggsave(paste0(figure_filepath, '/cwm_across_mapped_', x, '.png'), logos.plot, height = 9, width = 3)
  ggsave(paste0(figure_filepath, '/cwm_across_mapped_', x, '.pdf'), logos.plot, height = 9, width = 3)
  
  return(logos.plot)
}) 

all_logos.plot<-wrap_plots(cwms.plot.list) + plot_layout(nrow = 1)
all_logos.plot

ggsave(paste0(figure_filepath, '/cwm_across_mapped_all.png'), all_logos.plot, height = 11, width = 12)
ggsave(paste0(figure_filepath, '/cwm_across_mapped_all.pdf'), all_logos.plot, height = 11, width = 12)

Showcase low-affinity OS enhancer

curated_region_id<-86159
curated_coords<-c(85539625, 85539700)
curated_enhancer_boundaries.gr<-GRanges('chr10', IRanges(curated_coords[1], curated_coords[2]))

Generate contribution scores of mapped motifs.

curated_shap.plot<-mclapply(names(shap.bw.list), function(x){
  df<-data.frame(contrib = standard_metapeak_matrix(regions.gr = curated_enhancer_boundaries.gr, 
                                                sample.cov = shap.bw.list[[x]], 
                                                keep_region_coordinates = T) %>% 
               t()) %>% 
    dplyr::mutate(contrib_type = x, 
                  seq = getSeq(bsgenome, curated_enhancer_boundaries.gr, as.character = T) %>% stringr::str_split_1(pattern = ''),
                  position = start(curated_enhancer_boundaries.gr):end(curated_enhancer_boundaries.gr))
  
  # Generate sequence logo
  acc_contrib.plot<-df%>%
    tidyr::pivot_wider(names_from = seq, values_from = contrib) %>% 
    dplyr::select(A, C, G, T) %>% as.matrix() %>% t() %>%
    ggseqlogo(., method='custom', seq_type='dna') + 
    scale_y_continuous(name = 'Acc. contribution.', limits = c(-0.02, 0.08))+
    scale_x_discrete(name = 'Genomic position (chr10, bp)')+
    ggtitle(x)
  return(acc_contrib.plot)
}, mc.cores = 2) %>% wrap_plots(ncol = 1)

curated_shap.plot

ggsave(paste0(figure_filepath, '/OSlowaff_candidate_contrib.png'), curated_shap.plot, height = 6, width = 4)
ggsave(paste0(figure_filepath, '/OSlowaff_candidate_contrib.pdf'), curated_shap.plot, height = 6, width = 4)

For this subset, show perturbation effects as well.

region_start_0based<-regions.gr %>% as.data.frame %>% dplyr::filter(region_id == curated_region_id) %>% .$start
perturbs.df<-motifs.df %>% dplyr::filter(region_id == curated_region_id)

Plot metaplots across different concentrations

Motif frequencies and mappings

grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){
  message(x)
  
  #Import motifs
  motif.df<-motifs_w_perturb.df %>% dplyr::filter(motif==x) %>%
    dplyr::arrange(`perturb_0h`) %>%
    dplyr::mutate(row = 1:dplyr::n())
  
  #Plot motif sequences
  seq.plot<-plot_sequence(motif.df %>% .$seq, window = 17, title = x, genome = bsgenome, show = F)
  
  #Format marginalization scores
  marg.df<-motif.df %>% 
    dplyr::select(paste0('marg_', names(shap.bw.list)), seq, motif, seq_match_quantile, row) %>%
    tidyr::pivot_longer(cols = paste0('marg_', names(shap.bw.list)), names_to = 'timepoint', values_to = 'marg') %>%
    dplyr::mutate(timepoint = timepoint %>% gsub('marg_', '', .) %>% factor(., levels = names(shap.bw.list)))
  
  #Plot marginalizations across motifs
  marg.plot<-ggplot(marg.df, aes(x = timepoint, y = row, fill = marg))+
    ggrastr::geom_tile_rast()+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    scale_fill_gradient2(high = '#b2182b', mid = 'white', low = '#2166ac', midpoint = 0, name = 'Marg. score')+
    #limits = c(min(perturb.df$perturb), max(perturb.df$perturb)))+
    ggtitle(x)+
    theme_classic()
  
  #Format perturbation scores (with marginalization scores)
  perturb.df<-motif.df %>% 
    dplyr::select(paste0('perturb_', names(shap.bw.list)), seq, motif, seq_match_quantile, row) %>%
    tidyr::pivot_longer(cols = paste0('perturb_', names(shap.bw.list)), names_to = 'timepoint', values_to = 'perturb') %>%
    dplyr::mutate(timepoint = timepoint %>% gsub('perturb_', '', .) %>% factor(., levels = names(shap.bw.list)))
  
  #Plot perturbation scores
  perturb.plot<-ggplot(perturb.df, aes(x = timepoint, y = row, fill = perturb))+
    ggrastr::geom_tile_rast()+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    scale_fill_gradient2(high = '#b2182b', mid = 'white', low = '#2166ac', midpoint = 0, name = 'Perturb score',
                         limits = c(min(perturb.df$perturb), max(perturb.df$perturb)))+
    ggtitle(x)+
    theme_classic()  
  
  
  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####
  #Compare context/isolation based on whether it is grouped or isolated
    
  #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    motifs_w_grouped.df<-perturb.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content, atac_obs_q_bin, atac_obs)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    motifs_w_grouped.df<-perturb.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content, atac_obs_q_bin, atac_obs)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2', island_content))
  } 
  
  grouped_medians.df<-motifs_w_grouped.df %>% as.data.frame %>%
    
    #Remove all poor accessibility regions because we want to consider only open features. 
    #This cutoff was determined from absolute observed accessibility effects in later plots. See similar results despite changing cutoffs. 
    dplyr::filter(atac_obs_q_bin>=.6) %>% 
    dplyr::left_join(marg.df) %>%
    dplyr::group_by(island_state, timepoint, seq_match_quantile_bin) %>%
    dplyr::summarize(med_perturb = median(perturb),
                     med_marg = median(marg),
                     med_max = max(med_perturb, med_marg),
                     med_affinity = median(seq_match_quantile)) %>%
    dplyr::mutate(timepoint_numeric = timepoint %>% gsub('h', '', .) %>% as.integer(),
                  timepoint = factor(timepoint, levels = paste0(seq(0, 15, 3), 'h')))

  all_categories_bar.plot<-ggplot(grouped_medians.df)+
    geom_bar(aes(x = seq_match_quantile_bin, y = med_perturb, fill = timepoint), alpha = .5, stat = 'identity', position = 'dodge')+
    geom_bar(aes(x = seq_match_quantile_bin, y = med_marg, fill = timepoint), alpha = 1, color = 'black', linewidth = 1, stat = 'identity', position = 'dodge')+
    facet_grid(island_state ~ .)+
    scale_y_continuous(name = 'Context and isolation scores over group medians')+
    scale_x_discrete(name = 'Motif PWM group')+
    ggtitle('Context and isolation scores overlaid\nContext scores: transparent box\nIsolation scores:box with black outline')+
    theme_classic()
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_context_and_isolation_scores.png'), all_categories_bar.plot, height = 10, width = 8)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_context_and_isolation_scores.pdf'), all_categories_bar.plot, height = 10, width = 8)
  
  all_categories_line.plot<-ggplot(grouped_medians.df)+
    geom_line(aes(x = timepoint_numeric, y = med_perturb, color = seq_match_quantile_bin, linetype = island_state), color = 'black')+
    geom_line(aes(x = timepoint_numeric, y = med_marg, color = seq_match_quantile_bin, linetype = island_state), color = 'gray')+
    facet_grid(. ~ seq_match_quantile_bin)+
    scale_y_continuous(name = 'Context and isolation scores over group medians')+
    scale_x_continuous(name = 'Motif PWM group')+
    scale_linetype_manual(values = c('solid', 'dashed'))+
    ggtitle('Context and isolation scores overlaid\nContext scores: black\nIsolation scores:gray')+
    theme_classic()
  all_categories_line.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_context_and_isolation_scores_line.png'), all_categories_line.plot, height = 6, width = 12)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_context_and_isolation_scores_line.pdf'), all_categories_line.plot, height = 6, width = 12)
  
  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####
  #Mark whether its mapped by each motif by context
  motif.gr<-makeGRangesFromDataFrame(motif.df, keep.extra.columns = T, starts.in.df.are.0based = F,
                                     seqnames.field = 'seqnames', start.field = 'start',
                                     strand.field = 'strand', end.field = 'end')

  p.list<-os_patterns.list[[x]]
  mapped.df<-mclapply(names(p.list), function(y){
    hits.gr<-readr::read_tsv(paste0('modiscolite/', y, '_fold1_atac_counts/hits.tsv')) %>% 
      makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T) %>%
      dplyr::filter(pattern_name %in% p.list[[y]], metacluster_name=='pos_patterns')
    overlapsAny(motif.gr, hits.gr, ignore.strand = T)
  }, mc.cores = 2) %>% as.data.frame()
  colnames(mapped.df)<-names(p.list)
  mapped.df$row<-motif.gr$row
  
  mapped_by_contrib.plot<-mapped.df %>%
    tidyr::pivot_longer(names_to = 'timepoint', values_to = 'overlap', cols = names(p.list)) %>% 
    dplyr::mutate(timepoint = factor(timepoint, levels = names(p.list))) %>% 
    ggplot(., aes(x = timepoint, y = row, fill = overlap))+
    ggrastr::geom_tile_rast()+
    scale_fill_manual(values = c('white', 'black'), expand = c(0,0))+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    ggtitle(x, subtitle = 'Context ordered')+
    theme_classic()
  
  seq_match_quantile_by_contrib.plot<-ggplot(motif.df, aes(x = 1, y = row, fill = seq_match_quantile))+
    ggrastr::geom_tile_rast()+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    ggtitle(x, subtitle = 'Context ordered')+
    scale_fill_gradient(high = '#810f7c', low = 'white', name = 'seq_match_quantile')
  
  
  #Mark whether its mapped by each motif by isolation
  motif.gr<-makeGRangesFromDataFrame(motif.df, keep.extra.columns = T, starts.in.df.are.0based = F,
                                     seqnames.field = 'seqnames', start.field = 'start',
                                     strand.field = 'strand', end.field = 'end') %>%
    plyranges::arrange(`marg_0h`) %>%
    plyranges::mutate(row = 1:length(.))
  
  p.list<-os_patterns.list[[x]]
  mapped.df<-mclapply(names(p.list), function(y){
    hits.gr<-readr::read_tsv(paste0('modiscolite/', y, '_fold1_atac_counts/hits.tsv')) %>% 
      makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T) %>%
      dplyr::filter(pattern_name %in% p.list[[y]], metacluster_name=='pos_patterns')
    overlapsAny(motif.gr, hits.gr, ignore.strand = T)
  }, mc.cores = 2) %>% as.data.frame()
  colnames(mapped.df)<-names(p.list)
  mapped.df$row<-motif.gr$row
  
  mapped_by_isolation.plot<-mapped.df %>%
    tidyr::pivot_longer(names_to = 'timepoint', values_to = 'overlap', cols = names(p.list)) %>% 
    dplyr::mutate(timepoint = factor(timepoint, levels = names(p.list))) %>% 
    ggplot(., aes(x = timepoint, y = row, fill = overlap))+
    ggrastr::geom_tile_rast()+
    scale_fill_manual(values = c('white', 'black'), expand = c(0,0))+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    ggtitle(x, subtitle = 'Isolation ordered')+
    theme_classic()
  
  seq_match_quantile_by_isolation.plot<-ggplot(motif.gr %>% as.data.frame, aes(x = 1, y = row, fill = seq_match_quantile))+
    ggrastr::geom_tile_rast()+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    ggtitle(x, subtitle = 'Isolation ordered')+
    scale_fill_gradient(high = '#810f7c', low = 'white', name = 'seq_match_quantile')

  #Save plots as a summary plot
  metaplot<-seq_match_quantile_by_contrib.plot + mapped_by_contrib.plot + seq_match_quantile_by_isolation.plot + mapped_by_isolation.plot + plot_layout(nrow = 1)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_mapping_consistency_heatmaps.png'), metaplot, height = 5, width = 16)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_mapping_consistency_heatmaps.pdf'), metaplot, height = 5, width = 16)

  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####
  #Measure effects of accessibility levels from DESeq2
  atac_comparisons<-colnames(log2FoldChange_atac.df)
  obs.df<-motif.df %>% dplyr::select(motif, region_id, row) %>%
    dplyr::left_join(., regions_w_deseq.df %>% dplyr::select(atac_comparisons, region_id)) %>%
    dplyr::select(atac_comparisons, row) %>%
    tidyr::pivot_longer(cols = atac_comparisons,
                        names_to = 'timepoint', values_to = 'deseq_atac') %>%
    dplyr::mutate(timepoint = timepoint %>% factor(., levels = c(atac_comparisons)))
  
  #Plot ALL effects of accessibility levels across motifs
  obs.plot<-ggplot(obs.df, aes(x = timepoint, y = row, fill = deseq_atac))+ #log(`0h_diff_timepoint`)))+
    ggrastr::geom_tile_rast()+
    scale_x_discrete(expand = c(0,0))+
    scale_y_continuous(expand = c(0,0))+
    scale_fill_gradient2(low = '#810f7c', mid = 'white', high = '#feb24c', midpoint = 0, name = 'log(timepoint/WT)')+
    ggtitle(x)+
    theme_classic()

  #Save plots as a summary plot
  metaplot<-seq.plot$plot + seq_match_quantile_by_contrib.plot + mapped_by_contrib.plot + marg.plot + perturb.plot + obs.plot + plot_layout(nrow =1)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_heatmaps.png'), metaplot, height = 5, width = 20)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_heatmaps.pdf'), metaplot, height = 5, width = 20)
  
  # Plot rate of mappings
  counts.plot<-mapped.df %>% data.table %>% melt.data.table(id.vars = 'row', variable.name = 'timepoint', value.name = 'bool') %>%
    dplyr::group_by(timepoint) %>% 
    dplyr::summarize(count = sum(bool)) %>%
    dplyr::mutate(total_count = nrow(mapped.df),
                  rate = count / total_count) %>%
    ggplot(., aes(x = timepoint))+
    geom_text(aes(y = rate + .1, label = count))+
    scale_y_continuous(name = 'Overlap rate', breaks = c(0, .5, 1))+
    geom_bar(aes(y = rate), stat = 'identity')+theme_classic()
  
  #Group by affinity and islands and see if there are differences
  my_comparisons<-list(c(1, 2), c(1, 6))
  affinity_vs_grouped.plot<-motif.df %>% dplyr::select(row, region_id, seq_match_quantile) %>% 
    dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count)) %>%
    dplyr::filter(island_count<=6) %>%
    dplyr::group_by(island_count) %>% dplyr::mutate(med_seq_match_quantile = median(seq_match_quantile)) %>%
    ggplot(., aes(x = island_count, y = seq_match_quantile, group = island_count, fill = med_seq_match_quantile)) +
    geom_boxplot()+
    ggpubr::stat_compare_means(comparisons = my_comparisons, method = 't.test')+
    scale_y_continuous(name = 'Seq match')+
    scale_fill_gradient(high = '#810f7c', low = 'white', limits = c(.55, .75))+
    theme_classic()

  metaplot<-counts.plot + affinity_vs_grouped.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_freqs.png'), metaplot, height = 5, width = 10)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_freqs.pdf'), metaplot, height = 5, width = 10)
}) 

Motif contributions

grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){
  
  #Import motifs
  motif.df<-motifs_w_perturb.df %>% dplyr::filter(motif==x) %>%
    dplyr::arrange(contrib_0h) %>%
    dplyr::mutate(row = 1:dplyr::n())
  
  # Plot CWMs of actual mappings
  o.list<-os_orientation.list[[x]]
  p.list<-os_patterns.list[[x]]
  logos.plot.list<-mclapply(names(shap.bw.list), function(y){
    message(y)
    gr<-readr::read_tsv(paste0('modiscolite/atac_', y, '_fold1_atac_counts/hits.tsv')) %>% 
                        makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T) %>%
                        dplyr::filter(pattern_name %in% p.list[[paste0('atac_', y)]][1], metacluster_name=='pos_patterns')
    if(length(gr)>0){
      seqs<-gr %>% getSeq(bsgenome, ., as.character = T)
      if(o.list[paste0('atac_', y)]=='-'){
        seqs<-DNAStringSet(seqs) %>% reverseComplement(.) %>% as.character(.)
        gr<-gr %>% plyranges::mutate(strand = ifelse(strand=='+', '-', '+'))
      }
      contrib.mat<-standard_metapeak_matrix(regions.gr = gr,
                                          sample.cov = shap.bw.list[[y]],
                                          keep_region_coordinates = T)
    seqs_1he.list<-lapply(seqs, function(z) one_hot_encode_DNA(z))
    contribs.list<-lapply(1:dim(contrib.mat)[1], function(z){
      seqs_1he.list[[z]] * matrix(rep(contrib.mat[z,], 4), nrow = 4, byrow = T)
    })
    cwm<-Reduce("+", contribs.list) / length(contribs.list)
    
    cwm.plot<-ggseqlogo(cwm, method = 'custom', seq_type='dna')+
      scale_x_continuous(name = 'Motif position (bp)')+
      scale_y_continuous(name = 'Contrib', limits = c(-.005, .035))+
      ggtitle(paste0(x, '_', y)) + 
      theme(axis.line.y = element_line(), axis.ticks.y = element_line(),
            axis.title = element_blank(), axis.text.x = element_blank())
    } else {cwm.plot<-ggplot()}
    
    return(cwm.plot)
  }, mc.cores = 2)
  
  logos.plot<-wrap_plots(logos.plot.list) + plot_layout(ncol = 1)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_contrib.png'), logos.plot, height = 9, width = 3)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_contrib.pdf'), logos.plot, height = 9, width = 3)
  return(NULL)
}) 

Differential accessibility

Observed data (binned by affinity and arrangements)

grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){

  #Import motifs
  motif.df<-motifs_w_perturb.df %>% dplyr::filter(motif==x) %>%
    dplyr::arrange(contrib_0h) %>%
    dplyr::mutate(row = 1:dplyr::n())

  #Measure effects of accessibility levels from DESeq2
  atac_comparisons<-colnames(log2FoldChange_atac.df)
  obs.df<-motif.df %>% dplyr::select(motif, region_id, row) %>%
    dplyr::left_join(., regions_w_deseq.df %>% dplyr::select(atac_comparisons, region_id, atac_obs_q_bin)) %>%
    dplyr::select(atac_comparisons, row) %>%
    tidyr::pivot_longer(cols = atac_comparisons,
                        names_to = 'timepoint', values_to = 'deseq_atac') %>%
    dplyr::mutate(timepoint = timepoint %>% factor(., levels = c(atac_comparisons)))

   #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin, seq_match_quantile)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content, atac_obs_q_bin, atac_obs)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin, seq_match_quantile)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content, atac_obs_q_bin, atac_obs)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2', island_content))
  }
  
  motifs_w_grouped.df<-motif.df %>% dplyr::select(motif, region_id, row, seq_match_quantile_bin, seq_match_quantile) %>%
    dplyr::left_join(., regions_w_deseq.df %>% dplyr::select(atac_comparisons, region_id, atac_obs_q_bin, atac_obs_q, atac_obs)) %>%
    dplyr::left_join(grouped.df %>% dplyr::select(row, island_state) %>% unique()) %>% 
    dplyr::filter(atac_obs_q_bin>=.6) %>%
    dplyr::filter(!is.na(island_state))
  
  grouped.df<-grouped.df %>% dplyr::filter(atac_obs_q_bin>=.6)

  #Plot distributions of isolated or grouped
  abs_distributions.plot<-motifs_w_grouped.df %>% as.data.frame %>%
    ggplot(.,aes(x = log(atac_obs), fill = island_state))+
    geom_density(alpha = .5)+
    scale_x_continuous(name = 'Abs accessibility')+# limits = c(5, 7.5))+
    scale_y_continuous(name = 'Density')+
    scale_fill_manual(values = c('#b2182b', '#2166ac'))+
    ggtitle('Abs accessibiltiy')+
    theme_classic()
  abs_distributions.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_abs_accessibility_distributions.png'), abs_distributions.plot, height = 4, width = 6)
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_abs_accessibility_distributions.pdf'), abs_distributions.plot, height = 4, width = 6)
  print(wilcox.test(motifs_w_grouped.df %>% dplyr::filter(island_state=='isolated') %>% .$atac_obs %>% log, grouped.df %>% dplyr::filter(island_state=='grouped') %>% .$atac_obs %>% log))
  
  #Plot absolute accessibility over categories
  my_comparisons<-list(c('smq_1','smq_3'))
  all_abs_acc.plot<-motifs_w_grouped.df %>% as.data.frame %>%
    ggplot(.,aes(x = seq_match_quantile_bin, y = log(atac_obs)))+
    geom_boxplot(outliers = F)+
    ggpubr::stat_compare_means(comparisons = my_comparisons, method = 'wilcox.test')+
    scale_y_continuous(name = 'Abs accessibility')+# limits = c(5, 7.5))+
    scale_x_discrete(name = 'Motif PWM group')+
    facet_grid(. ~ island_state)+
    ggtitle('Abs accessibiltiy')+
    theme_classic()
  all_abs_acc.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_abs_accessibility.png'), all_abs_acc.plot, height = 4, width = 6)
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_abs_accessibility.pdf'), all_abs_acc.plot, height = 4, width = 6)

  #Plot  affinities over categories
  all_affinities.plot<-motifs_w_grouped.df %>% as.data.frame %>%
    
    #Remove all poor accessibility regions because we want to consider only open features. 
    #This cutoff was determined from absolute observed accessibility effects in later plots. See similar results despite changing cutoffs. 
    dplyr::filter(atac_obs_q_bin>=.6) %>%
    ggplot(.,aes(x = seq_match_quantile_bin, y = seq_match_quantile, fill = island_state))+
    geom_boxplot()+
    ggpubr::stat_compare_means(method = 'wilcox.test')+
    scale_y_continuous(name = 'PWM scores')+
    scale_x_discrete(name = 'Motif PWM group')+
    ggtitle('Affinities')+
    theme_classic()
  all_affinities.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_affinities.png'), all_affinities.plot, height = 4, width = 6)
  ggsave(paste0(figure_filepath, '/motif_role_over_showcased_groups_', x, '_affinities.pdf'), all_affinities.plot, height = 4, width = 6)

  #Group and summarize and plot motifs by boolean (grouped/ungrouped) and again by affinity group
  grouped_deseq.plot<-grouped.df %>%
    dplyr::group_by(island_state, timepoint) %>%
    dplyr::summarize(med_deseq_atac = median(deseq_atac, na.rm = T)) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_atac, color = island_state, group = interaction(island_state))) +
    # geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    theme_classic()
  
  affinity_deseq.plot<-grouped.df %>%
    dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
    dplyr::group_by(island_state, timepoint, seq_match_quantile_bin) %>%
    dplyr::summarize(med_deseq_atac = median(deseq_atac, na.rm = T)) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_atac, linetype = island_state, color = seq_match_quantile_bin, group = interaction(island_state, seq_match_quantile_bin))) +
    # geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    theme_classic()
  
  if(x=='Sox2'){affinity_deseq.plot<-affinity_deseq.plot + scale_y_continuous(name = 'Observed Log2FC (mut/0h)', limits = c(-.5, .2))}
  
  #Save plots as a summary plot
  metaplot<-grouped_deseq.plot + affinity_deseq.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_accessibility_summary.png'), metaplot, height = 5, width = 10)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_accessibility_summary.pdf'), metaplot, height = 5, width = 10)

  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####  #####
  
  #Group by number of islands, affinity and accessibility (basal)
  count_deseq_by_acc_and_affinity.plot<-grouped.df %>%
    dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
    dplyr::group_by(island_state, timepoint, atac_obs_q_bin, seq_match_quantile_bin) %>%
    dplyr::summarize(med_deseq_atac = median(deseq_atac, na.rm = T), 
                     counts = dplyr::n()) %>%
    dplyr::filter(counts>= count_threshold) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_atac, color = factor(island_state), group = interaction(island_state))) +
    geom_hline(yintercept = 0, linetype = 'dotted')+
    geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    facet_grid(atac_obs_q_bin ~ seq_match_quantile_bin)+
    # scale_color_manual(values = viridis(2))+
    theme_classic()
  
  #Group by number of islands
  count_deseq.plot<-grouped.df %>%
    dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
    dplyr::filter(island_count<=6) %>%
    dplyr::group_by(island_count, timepoint, atac_obs_q_bin) %>%
    dplyr::summarize(med_deseq_atac = median(deseq_atac, na.rm = T)) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_atac, color = island_count, group = interaction(island_count))) +
    geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    # scale_color_manual(values = turbo(6))+
    theme_classic()
  
  metaplot<-count_deseq_by_acc_and_affinity.plot + count_deseq.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_accessibility_expanded.png'), count_deseq_by_acc_and_affinity.plot, height = 10, width = 10)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_accessibility_expanded.png'), count_deseq_by_acc_and_affinity.plot, height = 10, width = 10)

  print(grouped.df %>%
          dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
          dplyr::select(island_state, seq_match_quantile_bin, row) %>% unique(.) %>%
          dplyr::group_by(island_state, seq_match_quantile_bin) %>% dplyr::summarize(count = dplyr::n()))
  
  #Plot absolute values as well
  atac_timepoints<-names(norm.atac.bw.list)
  obs.df<-motif.df %>% dplyr::select(motif, region_id, row) %>%
    dplyr::left_join(., regions_w_atac.df %>% dplyr::select(atac_timepoints, region_id, atac_obs_q_bin)) %>%
    dplyr::select(atac_timepoints, row) %>%
    tidyr::pivot_longer(cols = atac_timepoints,
                        names_to = 'timepoint', values_to = 'atac_obs') %>%
    dplyr::mutate(timepoint = timepoint %>% factor(., levels = c(atac_timepoints)))

  #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin)) %>% 
      dplyr::left_join(regions_w_deseq.df %>% dplyr::select(region_id, island_count, island_content)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2', island_content))
  }
  
  #Group by number of islands, affinity and accessibility (basal)
  count_deseq_by_acc_and_affinity.plot<-grouped.df %>%
    dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
    dplyr::group_by(island_state, timepoint, atac_obs_q_bin, seq_match_quantile_bin) %>%
    dplyr::summarize(med_atac_obs = median(atac_obs, na.rm = T), 
                     counts = dplyr::n()) %>%
    dplyr::filter(counts>= count_threshold) %>%
    ggplot(., aes(x = timepoint, y = med_atac_obs, color = factor(island_state), group = interaction(island_state))) +
    geom_hline(yintercept = 0, linetype = 'dotted')+
    geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed RPM-norm ATAC-seq levels')+
    facet_grid(atac_obs_q_bin ~ seq_match_quantile_bin)+
    # scale_color_manual(values = viridis(2))+
    theme_classic()
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_obs_accessibility_expanded.png'), count_deseq_by_acc_and_affinity.plot, height = 10, width = 10)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_obs_accessibility_expanded.pdf'), count_deseq_by_acc_and_affinity.plot, height = 10, width = 10)
  
  return(NULL)
})
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  motifs_w_grouped.df %>% dplyr::filter(island_state == "isolated") %>% .$atac_obs %>% log and grouped.df %>% dplyr::filter(island_state == "grouped") %>% .$atac_obs %>% log
## W = 51258228, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## # A tibble: 6 × 3
## # Groups:   island_state [2]
##   island_state seq_match_quantile_bin count
##   <chr>        <fct>                  <int>
## 1 grouped      smq_1                   1785
## 2 grouped      smq_2                   1600
## 3 grouped      smq_3                   1552
## 4 isolated     smq_1                   1580
## 5 isolated     smq_2                   1750
## 6 isolated     smq_3                   1852
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  motifs_w_grouped.df %>% dplyr::filter(island_state == "isolated") %>% .$atac_obs %>% log and grouped.df %>% dplyr::filter(island_state == "grouped") %>% .$atac_obs %>% log
## W = 272271410, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## # A tibble: 6 × 3
## # Groups:   island_state [2]
##   island_state seq_match_quantile_bin count
##   <chr>        <fct>                  <int>
## 1 grouped      smq_1                   4849
## 2 grouped      smq_2                   4596
## 3 grouped      smq_3                   3774
## 4 isolated     smq_1                   3135
## 5 isolated     smq_2                   2994
## 6 isolated     smq_3                   3971

Validate which metric explains differential accessibility

Given (1) pwm scores, (2) perturbation scores and (3) marginalization scores, which best explains the differential rates of accessibility changes?

Collect motifs with readouts.

motifs_w_differential_endpoint.df<-motifs_w_perturb.df %>% dplyr::left_join(regions_w_deseq_both.df %>% dplyr::select(region_id, deseq_15h_vs_0h))

Do using a joint model of all parameters

residuals.df<-lapply(c('Oct4-Sox2','Sox2','Klf4'), function(x){
    model<-lm(formula = deseq_15h_vs_0h ~ seq_match_quantile + perturb_0h + marg_0h, data =  motifs_w_differential_endpoint.df %>% dplyr::filter(motif==x))
    af <- anova(model)
    afss <- af$"Sum Sq"
    df<-cbind(af,var_explained=afss/sum(afss)*100) %>%
      dplyr::mutate(motif=x)
    df$comparison<-rownames(df)
    return(df)
  }) %>% rbindlist() %>%
    dplyr::filter(comparison != 'Residuals') %>%
    dplyr::mutate(model = 'acc')
residuals.df
##       Df      Sum Sq     Mean Sq    F value        Pr(>F) var_explained
##    <int>       <num>       <num>      <num>         <num>         <num>
## 1:     1  33.1936841  33.1936841  117.58334  2.480607e-27    0.46751864
## 2:     1 355.5999062 355.5999062 1259.65599 7.319046e-269    5.00847042
## 3:     1  31.6946762  31.6946762  112.27334  3.563313e-26    0.44640576
## 4:     1  30.9451881  30.9451881  127.60777  1.488654e-29    0.25860485
## 5:     1 124.8357243 124.8357243  514.78143 2.243222e-113    1.04323566
## 6:     1   0.5586818   0.5586818    2.30382  1.290628e-01    0.00466883
## 7:     1 213.2044618 213.2044618 1479.26096 7.243002e-321    1.66780341
## 8:     1   5.6553603   5.6553603   39.23817  3.769179e-10    0.04423936
## 9:     1 286.7683917 286.7683917 1989.66420  0.000000e+00    2.24326122
##        motif         comparison  model
##       <char>             <char> <char>
## 1: Oct4-Sox2 seq_match_quantile    acc
## 2: Oct4-Sox2         perturb_0h    acc
## 3: Oct4-Sox2            marg_0h    acc
## 4:      Sox2 seq_match_quantile    acc
## 5:      Sox2         perturb_0h    acc
## 6:      Sox2            marg_0h    acc
## 7:      Klf4 seq_match_quantile    acc
## 8:      Klf4         perturb_0h    acc
## 9:      Klf4            marg_0h    acc

Run single variate regressions instead of multivariate. This may make more sense given the comparisons, so we will do both and make sure that the results don’t change the trends we observe.

variables<-c('seq_match_quantile', 'perturb_0h', 'marg_0h')

residuals_indp.df<-lapply(c('Oct4-Sox2','Sox2','Klf4'), function(x){
    lapply(variables, function(y){
      motifs_w_differential_endpoint.df$readout<-motifs_w_differential_endpoint.df[[y]]
      model<-lm(formula = deseq_15h_vs_0h ~ readout, data =  motifs_w_differential_endpoint.df %>% dplyr::filter( motif==x))
      af <- anova(model)
      afss <- af$"Sum Sq"
      df<-cbind(af,var_explained=afss/sum(afss)*100) %>%
        dplyr::mutate(motif=x)
      df$comparison<-y
      return(df)
    }) %>% rbindlist()
  }) %>% rbindlist() %>%
    dplyr::filter(!is.na(`F value`)) %>%
    dplyr::mutate(model = 'acc')
residuals_indp.df
##       Df       Sum Sq      Mean Sq      F value        Pr(>F) var_explained
##    <int>        <num>        <num>        <num>         <num>         <num>
## 1:     1  33.19368408  33.19368408  111.1485764  6.267278e-26  0.4675186411
## 2:     1 383.03286399 383.03286399 1349.3808626 2.422138e-287  5.3948517336
## 3:     1  27.76619634  27.76619634   92.9033588  6.025686e-22  0.3910748307
## 4:     1  30.94518814  30.94518814  126.2722804  2.912595e-29  0.2586048502
## 5:     1 115.65939586 115.65939586  475.3235663 7.077350e-105  0.9665502956
## 6:     1  12.54801651  12.54801651   51.1235573  8.794754e-13  0.1048621167
## 7:     1 213.20446176 213.20446176 1444.8828503 1.599941e-313  1.6678034076
## 8:     1   0.02376292   0.02376292    0.1583553  6.906762e-01  0.0001858867
## 9:     1 461.89475150 461.89475150 3193.4310094  0.000000e+00  3.6131966194
##        motif         comparison  model
##       <char>             <char> <char>
## 1: Oct4-Sox2 seq_match_quantile    acc
## 2: Oct4-Sox2         perturb_0h    acc
## 3: Oct4-Sox2            marg_0h    acc
## 4:      Sox2 seq_match_quantile    acc
## 5:      Sox2         perturb_0h    acc
## 6:      Sox2            marg_0h    acc
## 7:      Klf4 seq_match_quantile    acc
## 8:      Klf4         perturb_0h    acc
## 9:      Klf4            marg_0h    acc

Compare models run with independence or not (because I’m paranoid and my regression theory is very rusty)

residuals.df<-dplyr::left_join(residuals.df %>% dplyr::select(motif, var_explained, comparison, model), 
                 residuals_indp.df %>% dplyr::select(motif, var_explained, comparison, model) %>% dplyr::rename(var_explained_indp = var_explained)) %>%
  dplyr::mutate(diff_var_expl_comb_diff_indp = round(var_explained - var_explained_indp, 2)) %>%
  dplyr::arrange(desc(abs(diff_var_expl_comb_diff_indp)))

Yeah, they’re similar. Single variate models like distance even more than PWM scores most of the time. Glad that’s stable. Correlations also agree.

Visualize residuals

plot<-residuals.df %>%
  dplyr::filter(motif %in% c('Oct4-Sox2')) %>%
  ggplot(., aes(x = motif, fill = comparison, y = var_explained))+
  geom_bar(stat = 'identity', position = 'dodge')+
  facet_grid(. ~ model)+
  theme_classic() + theme(legend.position = 'bottom')
plot

ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_interpretation_methods.png'), plot, height = 2.5, width = 6.5)
ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_interpretation_methods.pdf'), plot, height = 2.5, width = 6.5)

Provide direct correlations between the two groups

motifs_w_differential_endpoint.df %>% dplyr::group_by(motif) %>%
  dplyr::summarize(diff_vs_perturb = cor(deseq_15h_vs_0h, perturb_0h, method = 'spearman'), 
                   diff_vs_pwm = cor(deseq_15h_vs_0h, seq_match_quantile, method = 'spearman'),
                   diff_vs_marg = cor(deseq_15h_vs_0h, marg_0h, method = 'spearman'))
## # A tibble: 4 × 4
##   motif     diff_vs_perturb diff_vs_pwm diff_vs_marg
##   <chr>               <dbl>       <dbl>        <dbl>
## 1 Ctcf             NA           NA           NA     
## 2 Klf4              0.00221      0.123        0.184 
## 3 Oct4-Sox2        -0.259       -0.0738      -0.0784
## 4 Sox2             -0.0745       0.0534       0.0282

Validate which metric explains differential accessibility

Given (1) pwm scores, (2) perturbation scores and (3) marginalization scores, which best explains the data at the endpoints?

Collect motifs with readouts.

motifs_w_absolute_endpoint.df<-motifs_w_perturb.df %>% dplyr::left_join(regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs))

Do using a joint model of all parameters

residuals.df<-lapply(c('Oct4-Sox2','Sox2','Klf4'), function(x){
    model<-lm(formula = atac_obs ~ seq_match_quantile + perturb_0h + marg_0h, data =  motifs_w_absolute_endpoint.df %>% dplyr::filter(motif==x))
    af <- anova(model)
    afss <- af$"Sum Sq"
    df<-cbind(af,var_explained=afss/sum(afss)*100) %>%
      dplyr::mutate(motif=x)
    df$comparison<-rownames(df)
    return(df)
  }) %>% rbindlist() %>%
    dplyr::filter(comparison != 'Residuals') %>%
    dplyr::mutate(model = 'acc')
residuals.df
##       Df      Sum Sq     Mean Sq     F value        Pr(>F) var_explained
##    <int>       <num>       <num>       <num>         <num>         <num>
## 1:     1   3164845.1   3164845.1   64.923880  8.149757e-16    0.26407809
## 2:     1  36773436.9  36773436.9  754.373169 1.646126e-163    3.06841524
## 3:     1   5109170.6   5109170.6  104.809925  1.512580e-24    0.42631470
## 4:     1  11013327.1  11013327.1  196.152627  1.761470e-44    0.39868671
## 5:     1  16720207.3  16720207.3  297.794896  1.571900e-66    0.60527799
## 6:     1    322447.6    322447.6    5.742946  1.655875e-02    0.01167273
## 7:     1 124240638.2 124240638.2  836.889201 3.989214e-183    0.94457743
## 8:     1   3774080.3   3774080.3   25.422335  4.614971e-07    0.02869360
## 9:     1 378562674.9 378562674.9 2550.011164  0.000000e+00    2.87813844
##        motif         comparison  model
##       <char>             <char> <char>
## 1: Oct4-Sox2 seq_match_quantile    acc
## 2: Oct4-Sox2         perturb_0h    acc
## 3: Oct4-Sox2            marg_0h    acc
## 4:      Sox2 seq_match_quantile    acc
## 5:      Sox2         perturb_0h    acc
## 6:      Sox2            marg_0h    acc
## 7:      Klf4 seq_match_quantile    acc
## 8:      Klf4         perturb_0h    acc
## 9:      Klf4            marg_0h    acc

Run single variate regressions instead of multivariate. This may make more sense given the comparisons, so we will do both and make sure that the results don’t change the trends we observe.

variables<-c('seq_match_quantile', 'perturb_0h', 'marg_0h')

residuals_indp.df<-lapply(c('Oct4-Sox2','Sox2','Klf4'), function(x){
    lapply(variables, function(y){
      motifs_w_absolute_endpoint.df$readout<-motifs_w_absolute_endpoint.df[[y]]
      model<-lm(formula = atac_obs ~ readout, data =  motifs_w_absolute_endpoint.df %>% dplyr::filter( motif==x))
      af <- anova(model)
      afss <- af$"Sum Sq"
      df<-cbind(af,var_explained=afss/sum(afss)*100) %>%
        dplyr::mutate(motif=x)
      df$comparison<-y
      return(df)
    }) %>% rbindlist()
  }) %>% rbindlist() %>%
    dplyr::filter(!is.na(`F value`)) %>%
    dplyr::mutate(model = 'acc')
residuals_indp.df
##       Df       Sum Sq      Mean Sq      F value        Pr(>F) var_explained
##    <int>        <num>        <num>        <num>         <num>         <num>
## 1:     1 3.164845e+06 3.164845e+06 6.265425e+01  2.571236e-15  2.640781e-01
## 2:     1 2.320947e+07 2.320947e+07 4.673132e+02 1.212515e-102  1.936623e+00
## 3:     1 1.374474e+06 1.374474e+06 2.716969e+01  1.879219e-07  1.146876e-01
## 4:     1 1.101333e+07 1.101333e+07 1.949456e+02  3.222868e-44  3.986867e-01
## 5:     1 1.478977e+07 1.478977e+07 2.621518e+02  8.304500e-59  5.353953e-01
## 6:     1 7.995380e+06 7.995380e+06 1.413702e+02  1.481441e-32  2.894358e-01
## 7:     1 1.242406e+08 1.242406e+08 8.123493e+02 7.672369e-178  9.445774e-01
## 8:     1 6.058876e+01 6.058876e+01 3.924185e-04  9.841953e-01  4.606446e-07
## 9:     1 4.257896e+08 4.257896e+08 2.849994e+03  0.000000e+00  3.237196e+00
##        motif         comparison  model
##       <char>             <char> <char>
## 1: Oct4-Sox2 seq_match_quantile    acc
## 2: Oct4-Sox2         perturb_0h    acc
## 3: Oct4-Sox2            marg_0h    acc
## 4:      Sox2 seq_match_quantile    acc
## 5:      Sox2         perturb_0h    acc
## 6:      Sox2            marg_0h    acc
## 7:      Klf4 seq_match_quantile    acc
## 8:      Klf4         perturb_0h    acc
## 9:      Klf4            marg_0h    acc

Compare models run with independence or not (because I’m paranoid and my regression theory is very rusty)

residuals.df<-dplyr::left_join(residuals.df %>% dplyr::select(motif, var_explained, comparison, model), 
                 residuals_indp.df %>% dplyr::select(motif, var_explained, comparison, model) %>% dplyr::rename(var_explained_indp = var_explained)) %>%
  dplyr::mutate(diff_var_expl_comb_diff_indp = round(var_explained - var_explained_indp, 2)) %>%
  dplyr::arrange(desc(abs(diff_var_expl_comb_diff_indp)))

Yeah, they’re similar. Single variate models like distance even more than PWM scores most of the time. Glad that’s stable. Correlations also agree.

Visualize residuals

plot<-residuals.df %>%
  dplyr::filter(motif %in% c('Oct4-Sox2', 'Sox2')) %>%
  ggplot(., aes(x = motif, fill = comparison, y = var_explained))+
  geom_bar(stat = 'identity', position = 'dodge')+
  facet_grid(. ~ model)+
  theme_classic() + theme(legend.position = 'bottom')
plot

ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_interpretation_methods_absolute.png'), plot, height = 2.5, width = 6.5)
ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_interpretation_methods_absolute.pdf'), plot, height = 2.5, width = 6.5)

Provide direct correlations between the two groups

motifs_w_differential_endpoint.df %>% dplyr::group_by(motif) %>%
  dplyr::summarize(diff_vs_perturb = cor(deseq_15h_vs_0h, perturb_0h, method = 'spearman'), 
                   diff_vs_pwm = cor(deseq_15h_vs_0h, seq_match_quantile, method = 'spearman'),
                   diff_vs_marg = cor(deseq_15h_vs_0h, marg_0h, method = 'spearman'))
## # A tibble: 4 × 4
##   motif     diff_vs_perturb diff_vs_pwm diff_vs_marg
##   <chr>               <dbl>       <dbl>        <dbl>
## 1 Ctcf             NA           NA           NA     
## 2 Klf4              0.00221      0.123        0.184 
## 3 Oct4-Sox2        -0.259       -0.0738      -0.0784
## 4 Sox2             -0.0745       0.0534       0.0282

Observed data (binned by context groups)

grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){

  #Import motifs
  motif.df<-motifs_w_perturb.df %>% dplyr::filter(motif==x) %>%
    dplyr::arrange(contrib_0h) %>%
    dplyr::mutate(row = 1:dplyr::n(),
                  perturb_0h_quantile = ecdf(perturb_0h)(perturb_0h))

  #Measure effects of accessibility levels from DESeq2
  atac_comparisons<-paste0('deseq_', c('3h','6h','9h','12h','15h'), '_vs_0h')
  ttseq_comparisons<-paste0('ttseq_', c('3h','6h','9h','12h','15h'), '_vs_0h')
  obs.df<-motif.df %>% dplyr::select(motif, region_id, row) %>%
    dplyr::left_join(., regions_w_deseq_both.df %>% dplyr::select(atac_comparisons, ttseq_comparisons, region_id, atac_obs_q_bin)) %>%
    dplyr::select(atac_comparisons, ttseq_comparisons, row) %>%
    tidyr::pivot_longer(cols = c(atac_comparisons, ttseq_comparisons),
                        names_to = 'comparison', values_to = 'deseq') %>%
    dplyr::mutate(timepoint = gsub('deseq_', '', comparison) %>% gsub('ttseq_', '', .) %>% factor(., unique(.)),
                  experiment = ifelse(grepl('ttseq', comparison), 'ttseq', 'atacseq'))
  
  
  #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin, seq_match_quantile, coopscore_0h, perturb_0h, marg_0h, perturb_0h_quantile)) %>% 
      dplyr::left_join(regions_w_deseq_both.df %>% dplyr::select(region_id, island_count, island_content, atac_obs, atac_obs_q_bin)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
    
  } else{
    grouped.df<-obs.df %>% 
      dplyr::left_join(motif.df %>% dplyr::select(row, region_id, seq_match_quantile_bin, seq_match_quantile, coopscore_0h, perturb_0h, marg_0h, perturb_0h_quantile)) %>% 
      dplyr::left_join(regions_w_deseq_both.df %>% dplyr::select(region_id, island_count, island_content, atac_obs, atac_obs_q_bin)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2', island_content))
  }
  

  #Divvy things up by different modes of context, affinities, and responses.
  states.df<-rbind(
    grouped.df %>% dplyr::filter(seq_match_quantile<.2, perturb_0h_quantile>.8) %>% dplyr::mutate(custom_class = 'high context, low affinity'),
    grouped.df %>% dplyr::filter(seq_match_quantile>.8, perturb_0h_quantile>.8) %>% dplyr::mutate(custom_class = 'high context, high affinity'),
    grouped.df %>% dplyr::filter(abs(coopscore_0h)<=.05, seq_match_quantile>.8) %>% dplyr::mutate(custom_class = 'minimal context-isolation, high affinity'),
    grouped.df %>% dplyr::filter(abs(coopscore_0h)<=.05, seq_match_quantile<.2) %>% dplyr::mutate(custom_class = 'minimal context-isolation, low affinity')
  ) %>% dplyr::filter(atac_obs_q_bin>=.6)
  
  median_states_timepoint.plot<-states.df %>%
    dplyr::group_by(experiment, custom_class, timepoint) %>%
    dplyr::summarize(med_deseq = median(deseq, na.rm = T), freq = dplyr::n()) %>%
    ggplot(., aes(x = timepoint, y = med_deseq, color = custom_class, group = interaction(custom_class))) +
    geom_point()+
    geom_line()+
    scale_color_manual(name = 'Affinity', values = c('#A11E23', '#E8B4AF', 'black', 'gray'))+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    facet_grid(. ~ experiment)+
    ggtitle('Obs. diff. levels (timepoints, log)')+
    theme_classic()
  
  median_states_conc.plot<-states.df %>%
    dplyr::left_join(conc.df %>% dplyr::mutate(timepoint = paste0(treatment, '_vs_0h')) %>% dplyr::select(timepoint, Oct4)) %>%
    dplyr::group_by(experiment, custom_class, Oct4) %>%
    dplyr::summarize(med_deseq = median(deseq, na.rm = T), freq = dplyr::n()) %>%
    ggplot(., aes(x = Oct4, y = med_deseq, color = custom_class, group = interaction(custom_class))) +
    geom_point()+
    geom_line()+
    scale_color_manual(name = 'Affinity', values = c('#A11E23', '#E8B4AF', 'black', 'gray'))+
    scale_x_continuous(name = 'Relative Oct4 concentration', trans = 'reverse')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    facet_grid(. ~ experiment)+
    ggtitle('Obs. diff. levels (conc, log)')+
    theme_classic()
  
  median_states_timepoint_exp.plot<-states.df %>%
    dplyr::group_by(experiment, custom_class, timepoint) %>%
    dplyr::summarize(med_deseq = median(2^deseq, na.rm = T), freq = dplyr::n()) %>%
    ggplot(., aes(x = timepoint, y = med_deseq, color = custom_class, group = interaction(custom_class))) +
    geom_point()+
    geom_line()+
    scale_color_manual(name = 'Affinity', values = c('#A11E23', '#E8B4AF', 'black', 'gray'))+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed FC (mut/0h)')+
    facet_grid(. ~ experiment)+
    ggtitle('Obs. diff. levels (timepoints, exp)')+
    theme_classic()
  
  median_states_conc_exp.plot<-states.df %>%
    dplyr::left_join(conc.df %>% dplyr::mutate(timepoint = paste0(treatment, '_vs_0h')) %>% dplyr::select(timepoint, Oct4)) %>%
    dplyr::group_by(experiment, custom_class, Oct4) %>%
    dplyr::summarize(med_deseq = median(2^deseq, na.rm = T), freq = dplyr::n()) %>%
    ggplot(., aes(x = Oct4, y = med_deseq, color = custom_class, group = interaction(custom_class))) +
    geom_point()+
    geom_line()+
    scale_color_manual(name = 'Affinity', values = c('#A11E23', '#E8B4AF', 'black', 'gray'))+
    scale_x_continuous(name = 'Relative Oct4 concentration', trans = 'reverse')+
    scale_y_continuous(name = 'Observed FC (mut/0h)')+
    facet_grid(. ~ experiment)+
    ggtitle('Obs. diff. levels (conc, exp)')+
    theme_classic()

  states.metaplot<-median_states_timepoint.plot + median_states_conc.plot + median_states_timepoint_exp.plot + median_states_conc_exp.plot + plot_layout(ncol = 1)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_metaplots_medians.png'), states.metaplot, height = 16, width = 16)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_metaplots_medians.pdf'), states.metaplot, height = 16, width = 16)
  return(NULL)
})

Differential expression

Observed differential gene expression

#For all motifs, calculate the nearest gene occurrences
#When we subset, we will still take the only Oct4-Sox2 regulated regions that are nearby a gene ONE TIME within 100Kbp
motifs_via_occ.gr<-motifs.df %>% dplyr::filter(motif %in% c('Oct4-Sox2', 'Sox2')) %>%
  dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, reduced_region_id, island_content, island_count)) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end')
ttseq_w_deseq.gr<-ttseq_w_deseq.df %>% makeGRangesFromDataFrame(keep.extra.columns = T)

motifs_via_occ.gr$gene_name<-ttseq_w_deseq.gr$gene_name[nearest(motifs_via_occ.gr, ttseq_w_deseq.gr, ignore.strand = T)]
motifs_via_occ.gr$nearest_gene_start<-ttseq_w_deseq.df$start[nearest(motifs_via_occ.gr, ttseq_w_deseq.gr, ignore.strand = T)]
motifs_via_occ.gr$nearest_gene_distance<-abs(motifs_via_occ.gr$nearest_gene_start - start(motifs_via_occ.gr))

motifs_via_occ.gr<-motifs_via_occ.gr %>% plyranges::filter(nearest_gene_distance<=100000)
gene_occurrence.df<-motifs_via_occ.gr %>% as.data.frame %>% 
  dplyr::select(region_id, gene_name) %>% unique %>% 
  dplyr::group_by(gene_name) %>% dplyr::summarize(gene_occurrence = dplyr::n())

ttseq_comparisons<-paste0('deseq_', seq(3, 15, 3), 'h_vs_0h')

Plot NUMBER of target genes that are significantly changed over each timepoint for this motif.

plot.list<-lapply(c('Oct4-Sox2','Sox2'), function(x){
  m.gr<-motifs_via_occ.gr %>% plyranges::filter(motif==x)
  
  if(x=='Oct4-Sox2'){
    m.gr<-m.gr %>% 
      plyranges::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      plyranges::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    m.gr<-m.gr %>% 
      plyranges::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      plyranges::filter(!grepl('Oct4-Sox2', island_content))
  }
  
  signif_genes.df<-ttseq_w_signif.df %>% 
    dplyr::left_join(., gene_occurrence.df) %>%
    dplyr::mutate(gene_occurrence = ifelse(is.na(gene_occurrence), 0, gene_occurrence)) %>%
    dplyr::filter(gene_occurrence==1) %>%
    dplyr::left_join(m.gr %>% as.data.frame %>% dplyr::select(gene_name, region_id, island_count, island_content, nearest_gene_distance, seq_match_quantile_bin) %>% unique,
                     ., by = 'gene_name') %>%
    dplyr::filter(!is.na(gene_occurrence), nearest_gene_distance<5000) %>%
    dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped'))
  
  #Calculate how many are significant over each timepoint
  signif_genes_count.df<-signif_genes.df %>%
    dplyr::mutate(signif_3h_vs_0h_sign = ifelse(deseq_3h_vs_0h>0, signif_3h_vs_0h, signif_3h_vs_0h*(-1)), #pos is up regulated, neg is down regulated
                  signif_6h_vs_0h_sign = ifelse(deseq_6h_vs_0h>0, signif_6h_vs_0h, signif_6h_vs_0h*(-1)),
                  signif_9h_vs_0h_sign = ifelse(deseq_9h_vs_0h>0, signif_9h_vs_0h, signif_9h_vs_0h*(-1)),
                  signif_12h_vs_0h_sign = ifelse(deseq_12h_vs_0h>0, signif_12h_vs_0h, signif_12h_vs_0h*(-1)),
                  signif_15h_vs_0h_sign = ifelse(deseq_15h_vs_0h>0, signif_15h_vs_0h, signif_15h_vs_0h*(-1))) %>%
    dplyr::select(signif_3h_vs_0h_sign, signif_6h_vs_0h_sign, signif_9h_vs_0h_sign, signif_12h_vs_0h_sign, signif_15h_vs_0h_sign, gene_occurrence, island_state, seq_match_quantile_bin) %>%
    tidyr::pivot_longer(cols = c(signif_3h_vs_0h_sign, signif_6h_vs_0h_sign, signif_9h_vs_0h_sign, signif_12h_vs_0h_sign, signif_15h_vs_0h_sign),
                        names_to = 'timepoint_comparison', values_to = 'padj_signed') %>%
    dplyr::filter(!is.na(padj_signed), abs(padj_signed)<=0.1) %>%
    dplyr::mutate(sign = ifelse(padj_signed>0, 'upreg', 'downreg'),
                  padj = abs(padj_signed)) %>%
    tidyr::separate(timepoint_comparison,into = c(NA, 'timepoint', NA, NA, NA), sep = '_') %>%
    dplyr::mutate(timepoint = timepoint %>% gsub('h', '', .) %>% as.numeric())
  
  ggplot(signif_genes_count.df, aes(x = timepoint, fill = island_state))+
    geom_bar(position = 'dodge')+
    facet_grid(sign ~ seq_match_quantile_bin)+
    ggtitle(x)+
    theme_classic()

})
wrap_plots(plot.list)

Numbers are too few to tell. This analysis is inconclusive, sadly.

Plot magnitude of depletion of nearest target gene.

nearest_gene_count_threshold<-20
grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){
  
  motif.gr<-motifs_via_occ.gr %>% dplyr::filter(motif==x) %>%
    makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end')
  # ttseq_w_deseq.gr<-ttseq_w_deseq.df %>% makeGRangesFromDataFrame(keep.extra.columns = T)
  # 
  # motif.gr$gene_name<-ttseq_w_deseq.gr$gene_name[nearest(motif.gr, ttseq_w_deseq.gr, ignore.strand = T)]
  # motif.gr$nearest_gene_start<-ttseq_w_deseq.df$start[nearest(motif.gr, ttseq_w_deseq.gr, ignore.strand = T)]
  # motif.gr$nearest_gene_distance<-abs(motif.gr$nearest_gene_start - start(motif.gr))
  # 
  # motif.gr<-motif.gr %>% plyranges::filter(nearest_gene_distance<=100000)
  # gene_occurrence.df<-motif.gr %>% as.data.frame %>% dplyr::group_by(gene_name) %>% dplyr::summarize(gene_occurrence = dplyr::n())

  nearest_thresholds<-c(5000, 50000, 100000)
  #Now, replot but with boundaries rather than simple minimums
  plot.list<-lapply(nearest_thresholds, function(thresh){
    message(thresh)
    lower_bound<-max(nearest_thresholds[nearest_thresholds<thresh])
    if(is.infinite(lower_bound)){lower_bound<-0}

    motif_near.gr<-motif.gr %>% plyranges::filter(nearest_gene_distance<=thresh,
                                                  nearest_gene_distance>=lower_bound)

    genes_to_consider.df<-ttseq_w_deseq.gr %>% as.data.frame %>%
      dplyr::left_join(., gene_occurrence.df) %>%
      dplyr::mutate(gene_occurrence = ifelse(is.na(gene_occurrence), 0, gene_occurrence),
                    tteq_at_gene_0h = `X0h_1` + `X0h_2`) %>%
      dplyr::filter(gene_occurrence==1) %>%
      dplyr::left_join(motif_near.gr %>% as.data.frame %>% dplyr::select(gene_name, region_id, island_count, island_content) %>% unique,
                       ., by = 'gene_name') %>%
      dplyr::filter(!is.na(gene_occurrence)) %>%
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped'))

    #Collect islands belonging to each motif
    if(x=='Oct4-Sox2'){
      grouped.df<-genes_to_consider.df %>% dplyr::filter(grepl('Oct4-Sox2:1', island_content))
    } else{
      grouped.df<-genes_to_consider.df %>% dplyr::filter(!grepl('Oct4-Sox2', island_content))
    }
    readr::write_tsv(grouped.df, paste0('tsv/mapped_motifs/nearest_genes_curated_1based_wrt_', x, '_thresh_', as.integer(thresh), '.tsv.gz'))

    #Print numbers
    print(grouped.df %>% dplyr::select(region_id, island_state) %>% unique %>% 
            dplyr::group_by(island_state) %>% dplyr::summarize(counts = dplyr::n()))

    grouped.df<-grouped.df %>% tidyr::pivot_longer(cols = ttseq_comparisons,
                                                   names_to = 'timepoint', values_to = 'deseq') %>%
      dplyr::mutate(timepoint = timepoint %>% factor(., levels = c(ttseq_comparisons), labels = c(ttseq_comparisons %>% gsub('deseq', '', .))))
    

    #Group and summarize and plot motifs by boolean (grouped/ungrouped) and again by affinity group
    group.plot<-grouped.df %>%
      dplyr::group_by(island_state, timepoint) %>%
      dplyr::summarize(med_deseq = median(deseq, na.rm = T),
                       freq = dplyr::n()) %>%
      dplyr::filter(freq>=nearest_gene_count_threshold) %>%
      ggplot(., aes(x = timepoint, y = med_deseq, color = island_state, group = interaction(island_state))) +
      geom_hline(yintercept = 0, linetype = 'dashed')+
      geom_point()+
      geom_line()+
      # facet_grid(gene_freq ~ .)+
      scale_x_discrete(name = 'Timepoint')+
      scale_y_continuous(name = 'Observed Log2FC (mut/0h)', limits = c(-0.2, 0.05))+
      ggtitle(paste0('Gene distance threshold: ', lower_bound, ' <=', thresh))+
      theme_classic()

    return(group.plot)
  })

  grouped_deseq.plot<-wrap_plots(plot.list, ncol = 1)
  grouped_deseq.plot
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_ttseq_between_', x, '.png'), grouped_deseq.plot, height = 15, width = 10)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_ttseq_between_', x, '.pdf'), grouped_deseq.plot, height = 15, width = 10)
})
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped         302
## 2 isolated        421
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped         637
## 2 isolated        777
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped          97
## 2 isolated        164
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped         624
## 2 isolated        823
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped        1088
## 2 isolated       1401
## # A tibble: 2 × 2
##   island_state counts
##   <chr>         <int>
## 1 grouped         197
## 2 isolated        237

Observed differential enhancers

Next, we want to observe what’s happening for eRNAs across our different Oct4-Sox2 groups.

timepoints<-names(pred.bw.list)
grouped.plot<-lapply(c('Oct4-Sox2', 'Sox2'), function(x){
  message(x)

  #Import motifs
  motif.df<-motifs_w_perturb.df %>% dplyr::filter(motif==x) %>%
    dplyr::arrange(contrib_0h) %>%
    dplyr::mutate(row = 1:dplyr::n())

  #Measure effects of accessibility levels from DESeq2
  ttseq_comparisons<-paste0('ttseq_', timepoints, '_vs_0h')[-1]
  obs.df<-motif.df %>% 
    dplyr::left_join(., regions_w_deseq_both.df %>% dplyr::select(ttseq_comparisons, region_id, atac_obs_q_bin)) %>%
    tidyr::pivot_longer(cols = ttseq_comparisons,
                        names_to = 'timepoint', values_to = 'deseq_ttseq') %>%
    dplyr::mutate(timepoint = timepoint %>% factor(., levels = c(ttseq_comparisons)))

  #Collect islands belonging to each motif
  if(x=='Oct4-Sox2'){
    grouped.df<-obs.df %>% 
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(grepl('Oct4-Sox2:1', island_content))
  } else{
    grouped.df<-obs.df %>% 
      dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
      dplyr::filter(!grepl('Oct4-Sox2|pos_2|pos_9', island_content))
  }
  
  grouped.df<-grouped.df %>% dplyr::filter(atac_obs_q_bin>=.6)
  
  #Print numbers
  print(grouped.df %>%
          dplyr::mutate(island_state = ifelse(island_count==1, 'isolated', 'grouped')) %>%
          dplyr::select(island_state, seq_match_quantile_bin, row) %>% unique(.) %>%
          dplyr::group_by(island_state, seq_match_quantile_bin) %>% dplyr::summarize(count = dplyr::n()))
  
  #Do different affinities fall in fundamentally different regions?
  grouped.df %>% dplyr::filter(seq_match_quantile_bin=='smq_2') %>% .$island_content %>% table %>% sort(decreasing = T) %>% head(n=20)
  grouped.df %>% dplyr::filter(seq_match_quantile_bin=='smq_1') %>% .$island_content %>% table %>% sort(decreasing = T) %>% head(n=20)
  
  #Group and summarize and plot motifs by boolean (grouped/ungrouped) and again by affinity group
  grouped_deseq.plot<-grouped.df %>%
    dplyr::group_by(island_state, timepoint) %>%
    dplyr::summarize(med_deseq_ttseq = median(deseq_ttseq, na.rm = T)) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_ttseq, color = island_state, group = interaction(island_state))) +
    # geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    theme_classic()
  if(x=='Sox2'){grouped_deseq.plot<-grouped_deseq.plot + scale_y_continuous(name = 'Observed Log2FC (mut/0h)', limits = c(-.5, .2))}
  
  affinity_deseq.plot<-grouped.df %>%
    dplyr::group_by(island_state, timepoint, seq_match_quantile_bin) %>%
    dplyr::summarize(med_deseq_ttseq = median(deseq_ttseq, na.rm = T)) %>%
    ggplot(., aes(x = timepoint, y = med_deseq_ttseq, linetype = island_state, color = seq_match_quantile_bin, group = interaction(island_state, seq_match_quantile_bin))) +
    # geom_point()+
    geom_line()+
    scale_x_discrete(name = 'Timepoint')+
    scale_y_continuous(name = 'Observed Log2FC (mut/0h)')+
    theme_classic()
  
  if(x=='Sox2'){affinity_deseq.plot<-affinity_deseq.plot + scale_y_continuous(name = 'Observed Log2FC (mut/0h)', limits = c(-.5, .2))}
  
  #Save plots as a summary plot
  metaplot<-grouped_deseq.plot + affinity_deseq.plot + plot_annotation(title = paste0('Diff. TT-seq at enhancers for ', x))
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_erna_summary.png'), metaplot, height = 3, width = 8)
  ggsave(paste0(figure_filepath, '/motif_role_over_timepoints_', x, '_diff_erna_summary.pdf'), metaplot, height = 3, width = 8)
  
  return(NULL)
})
## # A tibble: 6 × 3
## # Groups:   island_state [2]
##   island_state seq_match_quantile_bin count
##   <chr>        <fct>                  <int>
## 1 grouped      smq_1                   1785
## 2 grouped      smq_2                   1600
## 3 grouped      smq_3                   1552
## 4 isolated     smq_1                   1580
## 5 isolated     smq_2                   1750
## 6 isolated     smq_3                   1852
## # A tibble: 6 × 3
## # Groups:   island_state [2]
##   island_state seq_match_quantile_bin count
##   <chr>        <fct>                  <int>
## 1 grouped      smq_1                   4849
## 2 grouped      smq_2                   4596
## 3 grouped      smq_3                   3774
## 4 isolated     smq_1                   3135
## 5 isolated     smq_2                   2994
## 6 isolated     smq_3                   3971

Export summary of OS motifs

Julia requested this spreadsheet at one point so we could mull over all the results and make sure our interpretations are correct.

Collect expression values.

atac_comparisons<-paste0('deseq_', c('3h','6h','9h','12h','15h'), '_vs_0h')
ttseq_comparisons<-paste0('ttseq_', c('3h','6h','9h','12h','15h'), '_vs_0h')
gene_comparisons<-paste0('deseq_', c('3h','6h','9h','12h','15h'), '_vs_0h')

gene_expr.df<-ttseq_w_deseq.df %>% dplyr::select(gene_name, gene_comparisons)
colnames(gene_expr.df)<-colnames(gene_expr.df) %>% gsub('deseq_', 'deseq_gene_', .)

local_expr.df<-regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs, atac_obs_q, atac_comparisons, ttseq_comparisons)
colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('deseq_', 'deseq_acc_', .)
colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('ttseq_', 'deseq_ttseq_', .)

Recalculate motif distances with unlimited lengths.

#Collect motifs with no limits on nearest gene start
motifs_via_occ.gr<-motifs.df %>% dplyr::filter(motif %in% c('Oct4-Sox2', 'Sox2')) %>%
  dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, reduced_region_id, island_content, island_count)) %>%
  makeGRangesFromDataFrame(keep.extra.columns = T, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end')
ttseq_w_deseq.gr<-ttseq_w_deseq.df %>% makeGRangesFromDataFrame(keep.extra.columns = T)

motifs_via_occ.gr$gene_name<-ttseq_w_deseq.gr$gene_name[nearest(motifs_via_occ.gr, ttseq_w_deseq.gr, ignore.strand = T)]
motifs_via_occ.gr$nearest_gene_start<-ttseq_w_deseq.df$start[nearest(motifs_via_occ.gr, ttseq_w_deseq.gr, ignore.strand = T)]
motifs_via_occ.gr$nearest_gene_distance<-abs(motifs_via_occ.gr$nearest_gene_start - start(motifs_via_occ.gr))

Figure out which motifs are mapped across each timepoint.

p.list<-os_patterns.list[['Oct4-Sox2']]
mapped.df<-mclapply(names(p.list), function(y){
  hits.gr<-readr::read_tsv(paste0('modiscolite/', y, '_fold1_atac_counts/hits.tsv')) %>% 
    makeGRangesFromDataFrame(keep.extra.columns = T, starts.in.df.are.0based = T) %>%
    dplyr::filter(pattern_name %in% p.list[[y]], metacluster_name=='pos_patterns')
  overlapsAny(motifs_w_perturb.df %>% makeGRangesFromDataFrame(keep.extra.columns = F, starts.in.df.are.0based = F, seqnames.field = 'seqnames', start.field = 'start', end.field = 'end'), hits.gr, ignore.strand = T)
}, mc.cores = 2) %>% as.data.frame()
colnames(mapped.df)<-names(p.list) %>% gsub('atac_', 'mapped_by_', .)

Collect and export motifs.

collected_motifs.df<-motifs_w_perturb.df %>% dplyr::select(seqnames, start, end, motif, motif_id, region_id, seq_match_quantile, 
                                      paste0('contrib_', names(shap.bw.list)), paste0('marg_', names(shap.bw.list)), paste0('perturb_', names(shap.bw.list)),
                                      island_content) %>%
  cbind(., mapped.df) %>%
  dplyr::left_join(., local_expr.df) %>%
  dplyr::left_join(., motifs_via_occ.gr %>% as.data.frame %>% dplyr::select(motif_id, nearest_gene_start, nearest_gene_distance, gene_name)) %>%
  dplyr::left_join(., gene_occurrence.df) %>%
  dplyr::left_join(., gene_expr.df %>% dplyr::group_by(gene_name) %>% dplyr::slice_max(deseq_gene_12h_vs_0h, with_ties = F)) %>%
  dplyr::left_join(., regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
  dplyr::filter(motif=='Oct4-Sox2') %>%
  dplyr::filter(atac_obs_q_bin>=.6)

writexl::write_xlsx(collected_motifs.df, 'misc/formatted_Oct4Sox2_motifs_over_concentration_depletion.xlsx')

Assess overall accessibility wrt context

local_expr.df<-regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs, atac_obs_q, atac_comparisons, ttseq_comparisons)
colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('deseq_', 'deseq_acc_', .)
colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('ttseq_', 'deseq_ttseq_', .)

motifs_w_metadata.df<-motifs_w_perturb.df %>% dplyr::select(seqnames, start, end, motif, motif_id, region_id, seq_match_quantile, 
                                      paste0('contrib_', names(shap.bw.list)), paste0('marg_', names(shap.bw.list)), paste0('perturb_', names(shap.bw.list)),
                                      island_content) %>%
  dplyr::left_join(., motifs_via_occ.gr %>% as.data.frame %>% dplyr::select(motif_id, nearest_gene_start, nearest_gene_distance, gene_name)) %>%
  dplyr::left_join(., local_expr.df) %>%
  dplyr::left_join(., regions_w_atac.df %>% dplyr::select(region_id, paste0(seq(0, 15, 3), 'h'), CpG_ratio)) %>%
  dplyr::filter(motif %in% c('Oct4-Sox2','Sox2')) %>% 
  dplyr::mutate(cpg_island = CpG_ratio>.6) %>%
  dplyr::left_join(., regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
  dplyr::filter(atac_obs_q_bin>=.6)

context_vs_acc_0h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_0h, y = log(`0h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc_9h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_9h, y = log(`9h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_9h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc_15h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_15h, y = log(`15h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_15h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc.plot<-context_vs_acc_0h.plot + context_vs_acc_9h.plot + context_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_obs_vs_ic_scatter.png'), context_vs_acc.plot, height = 20, width = 8)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_obs_vs_ic_scatter.pdf'), context_vs_acc.plot, height = 20, width = 8)
isolation_vs_acc_0h.plot<-ggplot(motifs_w_metadata.df, aes(x = marg_0h, y = log(`0h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc marg. log(inj/mut)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

isolation_vs_acc_9h.plot<-ggplot(motifs_w_metadata.df, aes(x = marg_9h, y = log(`9h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc marg. log(inj/mut)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_9h')+
  theme_classic() + theme(legend.position = 'bottom')

isolation_vs_acc_15h.plot<-ggplot(motifs_w_metadata.df, aes(x = marg_15h, y = log(`15h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc marg. log(inj/mut)')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_15h')+
  theme_classic() + theme(legend.position = 'bottom')

isolation_vs_acc.plot<-isolation_vs_acc_0h.plot + isolation_vs_acc_9h.plot + isolation_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_marg_vs_acc_obs_vs_ic_scatter.png'), isolation_vs_acc.plot, height = 20, width = 8)
ggsave(paste0(figure_filepath, '/acc_marg_vs_acc_obs_vs_ic_scatter.pdf'), isolation_vs_acc.plot, height = 20, width = 8)

Examine context - isolation scores

coopscore_vs_acc_0h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_0h - marg_0h, y = log(`0h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc_9h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_9h - marg_9h, y = log(`9h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_9h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc_15h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_15h - marg_15h, y = log(`15h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(cpg_island~motif, drop = T)+
  ggtitle('atac_15h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc.plot<-coopscore_vs_acc_0h.plot + coopscore_vs_acc_9h.plot + coopscore_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_coopscore_vs_acc_obs_vs_ic_scatter.png'), coopscore_vs_acc.plot, height = 20, width = 8)
ggsave(paste0(figure_filepath, '/acc_coopscore_vs_acc_obs_vs_ic_scatter.pdf'), coopscore_vs_acc.plot, height = 20, width = 8)

Examine cooperativity over time course.

all_perturbs.df<-readr::read_tsv('tsv/mapped_motifs/all_pairs_curated_0based_with_cooperativity.tsv.gz') %>%
  dplyr::mutate(motif_pair_distance = abs(pattern_center.y - pattern_center.x)) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(motif_pair_ordered = paste(sort(c(motif.x, motif.y)), collapse = " vs "),
                motif_pair_blind = paste0(motif.x, ' vs ', motif.y)) %>% 
  #Remove directionality from pairs by removing one of A->B and B->A pairs
  # dplyr::filter(motif_pair_ordered==motif_pair_blind) %>% 
  dplyr::ungroup()

models.list<-list(bpnet_osknz = c('oct4','sox2','nanog','klf4','zic3'),
                  atac_wt = c('atac'),
                  atac_0h = c('atac'),
                  atac_3h = c('atac'),
                  atac_6h = c('atac'),
                  atac_9h = c('atac'),
                  atac_12h = c('atac'),
                  atac_15h = c('atac'))

models_of_interest<-c('atac_wt','atac_0h','atac_9h','atac_15h')

perturbs.df<-mclapply(models_of_interest, function(x){
  p.df<-lapply(models.list[[x]], function(task){
    all_perturbs.df %>% 
      dplyr::select(region_id, pair_name, pair_id, motif_pair_ordered, motif_pair_blind, motif_pair_distance, 
                    motif.x, motif.y, seq_match_quantile.x, seq_match_quantile.y,
                    hAB_sum = !!sym(paste0(x, '/', task, '/hAB/pred_sum')),
                    hB_sum = !!sym(paste0(x, '/', task, '/hB/pred_sum')),
                    hA_sum = !!sym(paste0(x, '/', task, '/hA/pred_sum')),
                    null_sum = !!sym(paste0(x, '/', task, '/null/pred_sum'))) %>%
      dplyr::mutate(model_name = x,
                    task = task,
                    distance_class = cut(motif_pair_distance, 
                                         breaks = distance_class_breaks, 
                                         labels = distance_class_labels, 
                                         include.lowest = T),
                    add_coop = (hAB_sum - null_sum)/(hA_sum + hB_sum - 2*null_sum),
                    dir_coop = (hAB_sum - (hB_sum - null_sum))/(hA_sum),
                    mult_coop = log(hAB_sum/null_sum)/log((hA_sum*hB_sum)/(null_sum*null_sum)),
                    joint_effects = hAB_sum - null_sum,
                    marginal_effects = hA_sum + hB_sum - 2*null_sum,
                  A_effects = hA_sum - null_sum,
                  B_effects = hB_sum - null_sum)
  }) %>% rbindlist(.)
}, mc.cores = 6) %>% rbindlist(.) %>%
  dplyr::filter(motif_pair_distance<=400)

pairs_w_metadata.df<-perturbs.df %>%
  dplyr::left_join(., local_expr.df) %>%
  dplyr::left_join(., regions_w_atac.df %>% dplyr::select(region_id, paste0(seq(0, 15, 3), 'h'))) %>%
  dplyr::filter(motif.x %in% c('Oct4-Sox2','Sox2'), motif.y %in% c('Oct4-Sox2','Sox2'))


coop_vs_acc_0h.plot<-ggplot(pairs_w_metadata.df %>% dplyr::filter(model_name=='atac_0h'), aes(x = log(add_coop), y = log(`0h`)))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'Coop', limits = c(-4, 4))+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(~motif_pair_ordered, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

coop_vs_acc_9h.plot<-ggplot(pairs_w_metadata.df %>% dplyr::filter(model_name=='atac_9h'), aes(x = log(add_coop), y = log(`9h`)))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'Coop', limits = c(-4, 4))+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(~motif_pair_ordered, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

coop_vs_acc_15h.plot<-ggplot(pairs_w_metadata.df %>% dplyr::filter(model_name=='atac_15h'), aes(x = log(add_coop), y = log(`15h`)))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'Coop', limits = c(-4, 4))+
  scale_y_continuous(name = 'Acc obs (log)')+
  facet_grid(~motif_pair_ordered, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

coop_vs_acc.plot<-coop_vs_acc_0h.plot + coop_vs_acc_9h.plot + coop_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_coop_vs_acc_obs_vs_ic_scatter.png'), coop_vs_acc.plot, height = 12, width = 8)
ggsave(paste0(figure_filepath, '/acc_coop_vs_acc_obs_vs_ic_scatter.pdf'), coop_vs_acc.plot, height = 12, width = 8)

Assess predicted accessibility wrt context

# local_expr.df<-regions_w_deseq_both.df %>% dplyr::select(region_id, atac_pred, atac_pred_q, atac_comparisons, ttseq_comparisons)
# colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('deseq_', 'deseq_acc_', .)
# colnames(local_expr.df)<-colnames(local_expr.df) %>% gsub('ttseq_', 'deseq_ttseq_', .)

motifs_w_metadata.df<-motifs_w_perturb.df %>% dplyr::select(seqnames, start, end, motif, motif_id, region_id, seq_match_quantile, 
                                      paste0('contrib_', names(shap.bw.list)), paste0('marg_', names(shap.bw.list)), paste0('perturb_', names(shap.bw.list)),
                                      island_content) %>%
  dplyr::left_join(., regions_w_pred_atac.df %>% dplyr::select(region_id, paste0(seq(0, 15, 3), 'h'), CpG_ratio)) %>%
  dplyr::filter(motif %in% c('Oct4-Sox2','Sox2')) %>% 
  dplyr::mutate(cpg_island = CpG_ratio>.6) %>%
  dplyr::left_join(., regions_w_deseq_both.df %>% dplyr::select(region_id, atac_obs_q_bin)) %>%
  dplyr::filter(atac_obs_q_bin>=.6)

context_vs_acc_0h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_0h, y = log(`0h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc_9h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_9h, y = log(`9h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_9h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc_15h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_15h, y = log(`15h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc perturb. log(WT/dA)')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_15h')+
  theme_classic() + theme(legend.position = 'bottom')

context_vs_acc.plot<-context_vs_acc_0h.plot + context_vs_acc_9h.plot + context_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_pred_vs_ic_scatter.png'), context_vs_acc.plot, height = 12, width = 8)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_pred_vs_ic_scatter.pdf'), context_vs_acc.plot, height = 12, width = 8)

Examine context - isolation scores

coopscore_vs_acc_0h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_0h - marg_0h, y = log(`0h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_0h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc_9h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_9h - marg_9h, y = log(`9h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_9h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc_15h.plot<-ggplot(motifs_w_metadata.df, aes(x = perturb_15h - marg_15h, y = log(`15h`), color = seq_match_quantile))+
  ggrastr::geom_point_rast(size = .1)+
  ggpubr::stat_cor(method = 'spearman', label.x.npc = 'left', label.y.npc = 'top')+
  scale_color_gradientn(colours = viridis(5), name = 'PWM score %ile')+
  scale_x_continuous(name = 'acc coopscore. perturb - marg')+
  scale_y_continuous(name = 'Acc pred (log)')+
  facet_grid(.~motif, drop = T)+
  ggtitle('atac_15h')+
  theme_classic() + theme(legend.position = 'bottom')

coopscore_vs_acc.plot<-coopscore_vs_acc_0h.plot + coopscore_vs_acc_9h.plot + coopscore_vs_acc_15h.plot + plot_layout(ncol = 1)
ggsave(paste0(figure_filepath, '/acc_coopscore_vs_acc_pred_vs_ic_scatter.png'), coopscore_vs_acc.plot, height = 12, width = 8)
ggsave(paste0(figure_filepath, '/acc_coopscore_vs_acc_pred_vs_ic_scatter.pdf'), coopscore_vs_acc.plot, height = 12, width = 8)

Assess summarized acc vs context

obs_vs_context_summary.df<-motifs_w_perturb.df %>% dplyr::select(seqnames, start, end, motif, motif_id, region_id, seq_match_quantile, 
                                      paste0('contrib_', names(shap.bw.list)), paste0('marg_', names(shap.bw.list)), paste0('perturb_', names(shap.bw.list)),
                                      island_content) %>%
  dplyr::left_join(., regions_w_atac.df %>% dplyr::select(region_id, paste0(seq(0, 15, 3), 'h'), CpG_ratio) %>% dplyr::mutate(`0h_q` = ecdf(`0h`)(`0h`))) %>%
  dplyr::mutate(`0h_q_bin` = plyr::round_any(`0h_q`, .1, ceiling)) %>% 
  dplyr::filter(motif %in% c('Oct4-Sox2','Sox2')) %>%
  dplyr::group_by(motif, `0h_q_bin`, seq_match_quantile) %>%
  dplyr::mutate(`perturb_0h_med` = median(perturb_0h),
                `perturb_9h_med` = median(perturb_9h),
                `perturb_15h_med` = median(perturb_15h),
                `coopscore_0h_med` = median(perturb_0h - marg_0h),
                `coopscore_9h_med` = median(perturb_9h - marg_9h),
                `coopscore_15h_med` = median(perturb_15h - marg_15h),
                med_seq_match = median(seq_match_quantile)) %>%
  dplyr::filter(`0h_q_bin`>=.6)

coopscore_vs_acc_0h_bin.plot<-ggplot(obs_vs_context_summary.df, 
                                     aes(x = `0h_q_bin`, y = coopscore_0h_med, fill = med_seq_match, 
                                         group = `0h_q_bin`))+
  geom_boxplot()+
  scale_color_gradient(high = 'purple', low = 'white', name = 'Med. PWM score %ile')+
  scale_y_continuous(name = 'Acc coopscore. perturb - marg')+
  scale_x_continuous(name = 'Acc obs quantile bin')+
  facet_grid(.~motif, drop = T)+
  ggtitle('ATAC (0h)')+
  theme_classic() + theme(legend.position = 'bottom')
coopscore_vs_acc_0h_bin.plot

perturb_vs_acc_0h_bin.plot<-ggplot(obs_vs_context_summary.df, 
                                     aes(x = `0h_q_bin`, y = perturb_0h_med, fill = med_seq_match, 
                                         group = `0h_q_bin`))+
  geom_boxplot()+
  scale_color_gradient(high = 'purple', low = 'white', name = 'Med. PWM score %ile')+
  scale_y_continuous(name = 'Acc context')+
  scale_x_continuous(name = 'Acc obs quantile bin')+
  facet_grid(.~motif, drop = T)+
  ggtitle('ATAC (0h)')+
  theme_classic() + theme(legend.position = 'bottom')
perturb_vs_acc_0h_bin.plot

ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_obs_summary.png'), perturb_vs_acc_0h_bin.plot, height = 4, width = 8)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_obs_summary.pdf'), perturb_vs_acc_0h_bin.plot, height = 4, width = 8)
pred_vs_context_summary.df<-motifs_w_perturb.df %>% dplyr::select(seqnames, start, end, motif, motif_id, region_id, seq_match_quantile, 
                                      paste0('contrib_', names(shap.bw.list)), paste0('marg_', names(shap.bw.list)), paste0('perturb_', names(shap.bw.list)),
                                      island_content) %>%
  dplyr::left_join(., regions_w_pred_atac.df %>% dplyr::select(region_id, paste0(seq(0, 15, 3), 'h'), CpG_ratio) %>% dplyr::mutate(`0h_q` = ecdf(`0h`)(`0h`))) %>%
  dplyr::mutate(`0h_q_bin` = plyr::round_any(`0h_q`, .1, ceiling)) %>% 
  dplyr::filter(motif %in% c('Oct4-Sox2','Sox2')) %>%
  dplyr::group_by(motif, `0h_q_bin`, seq_match_quantile) %>%
  dplyr::mutate(`perturb_0h_med` = median(perturb_0h),
                `perturb_9h_med` = median(perturb_9h),
                `perturb_15h_med` = median(perturb_15h),
                `coopscore_0h_med` = median(perturb_0h - marg_0h),
                `coopscore_9h_med` = median(perturb_9h - marg_9h),
                `coopscore_15h_med` = median(perturb_15h - marg_15h),
                med_seq_match = median(seq_match_quantile)) %>%
  dplyr::filter(`0h_q_bin`>=.6)

coopscore_vs_acc_0h_bin.plot<-ggplot(pred_vs_context_summary.df, 
                                     aes(x = `0h_q_bin`, y = coopscore_0h_med, fill = med_seq_match, 
                                         group = `0h_q_bin`))+
  geom_boxplot()+
  scale_color_gradient(high = 'purple', low = 'white', name = 'Med. PWM score %ile')+
  scale_y_continuous(name = 'Acc coopscore. perturb - marg')+
  scale_x_continuous(name = 'Acc pred quantile bin')+
  facet_grid(.~motif, drop = T)+
  ggtitle('ATAC (0h)')+
  theme_classic() + theme(legend.position = 'bottom')
coopscore_vs_acc_0h_bin.plot

perturb_vs_acc_0h_bin.plot<-ggplot(pred_vs_context_summary.df, 
                                     aes(x = `0h_q_bin`, y = perturb_0h_med, fill = med_seq_match, 
                                         group = `0h_q_bin`))+
  geom_boxplot()+
  scale_color_gradient(high = 'purple', low = 'white', name = 'Med. PWM score %ile')+
  scale_y_continuous(name = 'Acc context')+
  scale_x_continuous(name = 'Acc pred quantile bin')+
  facet_grid(.~motif, drop = T)+
  ggtitle('ATAC (0h)')+
  theme_classic() + theme(legend.position = 'bottom')
perturb_vs_acc_0h_bin.plot

ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_pred_summary.png'), perturb_vs_acc_0h_bin.plot, height = 4, width = 8)
ggsave(paste0(figure_filepath, '/acc_perturb_vs_acc_pred_summary.pdf'), perturb_vs_acc_0h_bin.plot, height = 4, width = 8)

Summarize isolation/context by mapped/unmapped over timepoints

Barplots to show median context/isolation scores a la F1 based on whether they are mapped/unmapped by timepoint. Alongside this, show local accessibility effects.

median_m.df<-lapply(paste0(seq(0, 15, 3), 'h'), function(x){
  collected_motifs.df %>% dplyr::filter(!!sym(paste0('mapped_by_', x))) %>% dplyr::group_by(motif) %>%
  dplyr::summarize(med_perturb = median(!!sym(paste0('perturb_', x))),
                   med_marg = median(!!sym(paste0('marg_', x))),
                   med_deseq_acc_15h_vs_0h = median(deseq_acc_15h_vs_0h)) %>%
    dplyr::mutate(timepoint = x)
}) %>% rbindlist() %>% 
  dplyr::mutate(timepoint = factor(timepoint, paste0(seq(0, 15, 3), 'h')))

interp.plot<-median_m.df %>%
  tidyr::pivot_longer(cols = c('med_perturb', 'med_marg'), names_to = 'type', values_to = 'mut_effect') %>%
  ggplot(., aes(x = timepoint, y = mut_effect, fill = type))+
  geom_bar(stat = 'identity', position = 'dodge', color = 'black')+
  scale_y_continuous(name = 'Median Context or isolation scores')+
  scale_x_discrete(name = 'Motifs mapped by timepoint')+
  scale_fill_manual(values = c('#fdb863', '#8073ac'))+
  ggtitle('Predicted effects')+
  theme_classic() + theme(legend.position = 'bottom')
rel_interp.plot<-median_m.df %>%
  dplyr::mutate(coop_score = log(med_perturb/med_marg)) %>%
  ggplot(., aes(x = timepoint, y = coop_score))+
  geom_bar(stat = 'identity', position = 'dodge', fill = '#4575b4', color = 'black')+
  scale_y_continuous(name = 'log(context/isolation)')+
  scale_x_discrete(name = 'Motifs mapped by timepoint')+
  ggtitle('Relative cooperativity boost')+
  theme_classic() + theme(legend.position = 'bottom')
abs_interp.plot<-median_m.df %>%
  dplyr::mutate(coop_score = med_perturb - med_marg) %>%
  ggplot(., aes(x = timepoint, y = coop_score))+
  geom_bar(stat = 'identity', position = 'dodge', fill = 'navy', color = 'black')+
  scale_y_continuous(name = 'context - isolation')+
  scale_x_discrete(name = 'Motifs mapped by timepoint')+
  ggtitle('Absolute cooperativity boost')+
  theme_classic() + theme(legend.position = 'bottom')
acc.plot<-ggplot(median_m.df, aes(x = timepoint, y = -1 * med_deseq_acc_15h_vs_0h))+
  geom_bar(stat = 'identity', position = 'dodge', fill = 'black')+
  scale_y_continuous(name = 'Median diff DESeq2 acc (wt/mut)')+
  scale_x_discrete(name = 'Motifs mapped by timepoint')+
  ggtitle('Observed acc. effects')+
  theme_classic()
plot<-interp.plot + rel_interp.plot + abs_interp.plot + acc.plot + plot_layout(nrow = 1, widths = c(2, 1, 1, 1))
plot

ggsave(paste0(figure_filepath, '/interp_by_mapped.png'), plot, height = 4, width = 12)
ggsave(paste0(figure_filepath, '/interp_by_mapped.pdf'), plot, height = 4, width = 12)

Further, statistically validate that context scores (versus PWM scores or isolation scores) are the most predictive of whether a motif will be mapped or not.

interp_vs_mapping_longevity.df<-collected_motifs.df %>% tidyr::pivot_longer(cols = paste0('mapped_by_', seq(0, 12, 3), 'h'), names_to = 'timepoint', values_to = 'mapped_status') %>%
  dplyr::select(perturb_0h, marg_0h, seq_match_quantile, motif_id, timepoint, mapped_status) %>%
  dplyr::mutate(timepoint = gsub('mapped_by_', '', timepoint) %>% gsub('h', '', .) %>% as.numeric()) %>%
  dplyr::filter(mapped_status) %>%
  dplyr::group_by(motif_id) %>% 
  dplyr::slice_max(timepoint, n=1, with_ties = F, na_rm = T)

Test residuals explaining which one causes longest mapped motif.

model<-lm(formula = timepoint ~ seq_match_quantile + perturb_0h + marg_0h, data =  interp_vs_mapping_longevity.df)
af <- anova(model)
afss <- af$"Sum Sq"
df<-cbind(af,var_explained=afss/sum(afss)*100)
df$comparison<-rownames(df)
df
##                       Df    Sum Sq      Mean Sq   F value       Pr(>F)
## seq_match_quantile     1 44107.667 44107.666720 6455.2315 0.000000e+00
## perturb_0h             1 41422.861 41422.860591 6062.3056 0.000000e+00
## marg_0h                1   708.537   708.536965  103.6956 2.922954e-24
## Residuals          12717 86893.428     6.832856        NA           NA
##                    var_explained         comparison
## seq_match_quantile    25.4762500 seq_match_quantile
## perturb_0h            23.9255266         perturb_0h
## marg_0h                0.4092455            marg_0h
## Residuals             50.1889779          Residuals
model<-lm(formula = timepoint ~ seq_match_quantile, data =  interp_vs_mapping_longevity.df)
af <- anova(model)
afss <- af$"Sum Sq"
seq.df<-cbind(af,var_explained=afss/sum(afss)*100)
seq.df$comparison<-rownames(seq.df)
seq.df
##                       Df    Sum Sq     Mean Sq  F value Pr(>F) var_explained
## seq_match_quantile     1  44107.67 44107.66672 4348.042      0      25.47625
## Residuals          12719 129024.83    10.14426       NA     NA      74.52375
##                            comparison
## seq_match_quantile seq_match_quantile
## Residuals                   Residuals
model<-lm(formula = timepoint ~ perturb_0h, data =  interp_vs_mapping_longevity.df)
af <- anova(model)
afss <- af$"Sum Sq"
perturb.df<-cbind(af,var_explained=afss/sum(afss)*100)
perturb.df$comparison<-rownames(perturb.df)
perturb.df
##               Df   Sum Sq      Mean Sq F value Pr(>F) var_explained comparison
## perturb_0h     1 75762.88 75762.881144  9896.6      0      43.76006 perturb_0h
## Residuals  12719 97369.61     7.655446      NA     NA      56.23994  Residuals
model<-lm(formula = timepoint ~ marg_0h, data =  interp_vs_mapping_longevity.df)
af <- anova(model)
afss <- af$"Sum Sq"
marg.df<-cbind(af,var_explained=afss/sum(afss)*100)
marg.df$comparison<-rownames(marg.df)
marg.df
##              Df    Sum Sq      Mean Sq  F value Pr(>F) var_explained comparison
## marg_0h       1  48861.43 48861.432647 5000.911      0      28.22199    marg_0h
## Residuals 12719 124271.06     9.770506       NA     NA      71.77801  Residuals

Quantify levels that are explained by each feature.

plot<-rbind(seq.df %>% dplyr::mutate(type = 'seq'), marg.df %>% dplyr::mutate(type = 'marg'), perturb.df %>% dplyr::mutate(type = 'perturb')) %>%
  dplyr::filter(comparison!='Residuals') %>%
  ggplot(., aes(x = type, fill = type, y = var_explained))+
  geom_bar(stat = 'identity', position = 'dodge')+
  theme_classic() + theme(legend.position = 'bottom')
plot

ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_mapping_frequency.png'), plot, height = 2.5, width = 6.5)
ggsave(paste0(figure_filepath, '/perturb_vs_pwm_and_mapping_frequency.pdf'), plot, height = 2.5, width = 6.5)

Barplots to show median context/isolation scores a la F1 based on whether they are mapped/unmapped by timepoint. Alongside this, show local accessibility effects. However, resummarize by looking at EACH timepoint by filtering for motifs maintained so its the “same” set of motifs per plot

plot_all<-lapply(paste0(seq(0, 12, 3), 'h'), function(y){
  
  median_m.df<-lapply(paste0(seq(0, 15, 3), 'h'), function(x){
    collected_motifs.df %>% dplyr::mutate(deseq_acc_0h_vs_0h = 0) %>% dplyr::filter(!!sym(paste0('mapped_by_', y))) %>% dplyr::group_by(motif) %>%
      dplyr::summarize(med_perturb = median(!!sym(paste0('perturb_', x))),
                       med_marg = median(!!sym(paste0('marg_', x))),
                       med_deseq_acc_mut_vs_0h = median(!!sym(paste0('deseq_acc_', x, '_vs_0h')))) %>%
      dplyr::mutate(timepoint = x)
  }) %>% rbindlist() %>% 
    dplyr::mutate(timepoint = factor(timepoint, paste0(seq(0, 15, 3), 'h')))
  
  interp.plot<-median_m.df %>%
    tidyr::pivot_longer(cols = c('med_perturb', 'med_marg'), names_to = 'type', values_to = 'mut_effect') %>%
    ggplot(., aes(x = timepoint, y = mut_effect, fill = type))+
    geom_bar(stat = 'identity', position = 'dodge', color = 'black')+
    scale_y_continuous(name = 'Median Context or isolation scores', limits = c(0, .65))+
    scale_x_discrete(name = 'Motifs mapped by timepoint')+
    scale_fill_manual(values = c('#fdb863', '#8073ac'))+
    ggtitle('Predicted effects', subtitle = paste0('Motifs mapped only from ', y))+
    theme_classic() + theme(legend.position = 'bottom')
  rel_interp.plot<-median_m.df %>%
    dplyr::mutate(coop_score = log(med_perturb/med_marg)) %>%
    ggplot(., aes(x = timepoint, y = coop_score))+
    geom_bar(stat = 'identity', position = 'dodge', fill = '#4575b4', color = 'black')+
    scale_y_continuous(name = 'log(context/isolation)', limits = c(0, 3.5))+
    scale_x_discrete(name = 'Motifs mapped by timepoint')+
    ggtitle('Relative cooperativity boost', subtitle = paste0('Motifs mapped only from ', y))+
    theme_classic() + theme(legend.position = 'bottom')
  abs_interp.plot<-median_m.df %>%
    dplyr::mutate(coop_score = med_perturb - med_marg) %>%
    ggplot(., aes(x = timepoint, y = coop_score))+
    geom_bar(stat = 'identity', position = 'dodge', fill = 'navy', color = 'black')+
    scale_y_continuous(name = 'context - isolation', limits = c(0, 0.4))+
    scale_x_discrete(name = 'Motifs mapped by timepoint')+
    ggtitle('Absolute cooperativity boost', subtitle = paste0('Motifs mapped only from ', y))+
    theme_classic() + theme(legend.position = 'bottom')
  acc.plot<-ggplot(median_m.df, aes(x = timepoint, y = -1 * med_deseq_acc_mut_vs_0h))+
    geom_bar(stat = 'identity', position = 'dodge', fill = 'black')+
    scale_y_continuous(name = 'Median diff DESeq2 acc (wt/mut)', limits = c(0, .85))+
    scale_x_discrete(name = 'Motifs mapped by timepoint')+
    ggtitle('Observed acc. effects', subtitle = paste0('Motifs mapped only from ', y))+
    theme_classic()
  plot<-interp.plot + rel_interp.plot + abs_interp.plot + acc.plot + plot_layout(nrow = 1, widths = c(2, 1, 1, 1))
  
  return(plot)
  
}) %>% wrap_plots(ncol = 1)
ggsave(paste0(figure_filepath, '/interp_by_mapped_motifs_separated_by_motifs_mapped_by_timepoints.png'), plot_all, height = 20, width = 12)
ggsave(paste0(figure_filepath, '/interp_by_mapped_motifs_separated_by_motifs_mapped_by_timepoints.pdf'), plot_all, height = 20, width = 12)

Assess cooperativity and affinity and distance

Plot in vivo contribution across different states

Format in vivo contribution scores across relevant motifs.

curated_islands_of_interest<-c('Oct4-Sox2', 'Oct4-Sox2_Sox2')
contrib_limits<-c(-.02, .38)

motifs_w_islands.df<-motifs.df %>% dplyr::left_join(., regions.gr %>% as.data.frame %>% dplyr::select(region_id, island_content, island_content_unique, island_count)) %>%
  dplyr::group_by(region_id) %>%
  dplyr::mutate(distance = max(end)-min(start),
                distance_bin = cut(distance, breaks=distance_class_breaks, include.lowest = T, labels = distance_class_labels),
                ic_match_bin = cut(seq_match_quantile, breaks = seq(0, 1, .2), labels = paste0('q', 1:5), include.lowest = T))%>%
  dplyr::ungroup()

os_alone_or_w_1_partner.df<-motifs_w_islands.df %>% 
  dplyr::filter(motif=='Oct4-Sox2', island_content_unique %in% curated_islands_of_interest, 
                island_count<=2, !(island_content_unique=='Oct4-Sox2' & island_count==2)) %>%
  as.data.table %>%
  dplyr::left_join(., motifs_w_islands.df %>% dplyr::filter(motif=='Sox2') %>% dplyr::select(region_id, ic_match_bin) %>% dplyr::rename(sox2_ic_match_bin = ic_match_bin)) %>%
  melt.data.table(id.vars = c('motif_id', 'island_content_unique', 'ic_match_bin', 'sox2_ic_match_bin', 'distance_bin', 'island_count'), 
                  measure.vars = names(shap.bw.list), 
                  variable.name = 'treatment', value.name = 'acc_contrib') %>%
  dplyr::mutate(treatment_numeric = gsub('h', '', treatment) %>% as.numeric())

Plot relationship to affinity.

affinity.plot<-os_alone_or_w_1_partner.df %>%
  dplyr::filter(ic_match_bin %in% c('q1','q5')) %>%
  dplyr::group_by(ic_match_bin, treatment_numeric) %>%
  dplyr::summarize(acc_contrib_med = median(acc_contrib),
                   count = dplyr::n()) %>%
  dplyr::filter(count>=count_threshold) %>%
  ggplot(., aes(x = treatment_numeric, y = acc_contrib_med, color = ic_match_bin))+
  geom_hline(yintercept = 0, linetype = 'dashed', color = 'black')+
  geom_line()+ geom_point()+
  geom_text(aes(x = Inf, y = Inf, label = count), hjust = 1, vjust = 1)+
  scale_color_manual(name = 'Affinity', values = c('#E8B4AF', '#A11E23'))+
  scale_y_continuous(name = 'Median acc. contrib', limits = contrib_limits)+
  scale_x_continuous(name = 'Treatment (h)', breaks = seq(0, 15, 3))+
  theme_classic() + theme(legend.position = 'bottom')
affinity.plot

Plot role of cooperativity.

cooperativity.plot<-os_alone_or_w_1_partner.df %>%
  dplyr::filter(ic_match_bin %in% c('q1','q5')) %>%
  dplyr::group_by(ic_match_bin, island_count, treatment_numeric) %>%
  dplyr::summarize(acc_contrib_med = median(acc_contrib),
                   count = dplyr::n()) %>%
  dplyr::filter(count>=count_threshold) %>%
  dplyr::mutate(island_count = factor(island_count, levels = unique(island_count))) %>%
  ggplot(., aes(x = treatment_numeric, y = acc_contrib_med, color = ic_match_bin, linetype = island_count))+
  geom_hline(yintercept = 0, linetype = 'dashed', color = 'black')+
  geom_line()+ geom_point()+
  geom_text(aes(x = Inf, y = Inf, label = count), hjust = 1, vjust = 1)+
  scale_color_manual(name = 'Affinity', values = c('#E8B4AF', '#A11E23'))+
  scale_y_continuous(name = 'Median acc. contrib', limits = contrib_limits)+
  scale_x_continuous(name = 'Treatment (h)', breaks = seq(0, 15, 3))+
  scale_linetype_manual(values = c('solid', 'dashed'))+
  theme_classic() + theme(legend.position = 'bottom')
cooperativity.plot

Plot role of distance at low-affinity motifs.

distance.plot<-os_alone_or_w_1_partner.df %>%
  dplyr::filter(ic_match_bin %in% c('q1','q5'), distance_bin %in% c('protein-range', 'nucleosome-range'), island_count==2) %>%
  dplyr::group_by(ic_match_bin, distance_bin, treatment_numeric) %>%
  dplyr::summarize(acc_contrib_med = median(acc_contrib),
                   count = dplyr::n()) %>%
  dplyr::filter(count>=count_threshold) %>%
  ggplot(., aes(x = treatment_numeric, y = acc_contrib_med, color = ic_match_bin, linetype = distance_bin))+
  geom_hline(yintercept = 0, linetype = 'dashed', color = 'black')+
  geom_line()+ geom_point()+
  geom_text(aes(x = Inf, y = Inf, label = count), hjust = 1, vjust = 1)+
  scale_color_manual(name = 'Affinity', values = c('#E8B4AF', '#A11E23'))+
  scale_y_continuous(name = 'Median acc. contrib', limits = contrib_limits)+
  scale_x_continuous(name = 'Treatment (h)', breaks = seq(0, 15, 3))+
  scale_linetype_manual(values = c('solid', 'dashed'))+
  theme_classic() + theme(legend.position = 'bottom')
distance.plot

Group plots together.

invivo_cooperativity.plot<-affinity.plot + cooperativity.plot + distance.plot
invivo_cooperativity.plot

ggsave(paste0(figure_filepath, '/contrib_metaplots_for_Oct4-Sox2.png'), invivo_cooperativity.plot, height = 3, width = 8)
ggsave(paste0(figure_filepath, '/contrib_metaplots_for_Oct4-Sox2.pdf'), invivo_cooperativity.plot, height = 3, width = 8)

Export median cooperativity values.

os_alone_or_w_1_partner.df %>%
  dplyr::filter(ic_match_bin %in% c('q1','q5'), distance_bin %in% c('protein-range', 'nucleosome-range')) %>%
  dplyr::group_by(ic_match_bin, distance_bin, treatment_numeric, island_count) %>%
  dplyr::rename(is_motif_paired_w_sox2 = island_count) %>%
  dplyr::mutate(is_motif_paired_w_sox2 = ifelse(is_motif_paired_w_sox2==2, T, F)) %>%
  dplyr::summarize(acc_contrib_med = median(acc_contrib),
                   count = dplyr::n()) %>%
  dplyr::arrange(is_motif_paired_w_sox2, distance_bin, ic_match_bin, treatment_numeric) %>%
  readr::write_tsv(., 'tsv/genomic/os_vs_s_cooperativity_medians_across_timepoints.tsv.gz')
os_alone_or_w_1_partner.df %>%
  dplyr::filter(ic_match_bin %in% c('q1','q5'), distance_bin %in% c('protein-range', 'nucleosome-range')) %>%
  dplyr::group_by(ic_match_bin, distance_bin, treatment_numeric, island_count, sox2_ic_match_bin) %>%
  dplyr::rename(is_motif_paired_w_sox2 = island_count) %>%
  dplyr::mutate(is_motif_paired_w_sox2 = ifelse(is_motif_paired_w_sox2==2, T, F)) %>%
  dplyr::summarize(acc_contrib_med = median(acc_contrib),
                   count = dplyr::n()) %>%
  dplyr::arrange(is_motif_paired_w_sox2, distance_bin, ic_match_bin, treatment_numeric) %>%
  readr::write_tsv(., 'tsv/genomic/os_vs_s_cooperativity_medians_across_timepoints_with_expanded_sox2_affinity_groups.tsv.gz')

Session Information

For the purposes of reproducibility, the R/Bioconductor session information is printed below:

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Rocky Linux 8.9 (Green Obsidian)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Chicago
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] DESeq2_1.44.0                            
##  [2] SummarizedExperiment_1.34.0              
##  [3] MatrixGenerics_1.16.0                    
##  [4] matrixStats_1.5.0                        
##  [5] ggseqlogo_0.2                            
##  [6] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
##  [7] GenomicFeatures_1.56.0                   
##  [8] AnnotationDbi_1.66.0                     
##  [9] Biobase_2.64.0                           
## [10] BSgenome.Mmusculus.UCSC.mm10_1.4.3       
## [11] BSgenome_1.72.0                          
## [12] BiocIO_1.14.0                            
## [13] viridis_0.6.5                            
## [14] viridisLite_0.4.2                        
## [15] digest_0.6.37                            
## [16] pander_0.6.6                             
## [17] lattice_0.22-6                           
## [18] testit_0.13                              
## [19] readr_2.1.5                              
## [20] patchwork_1.3.0                          
## [21] data.table_1.17.0                        
## [22] dplyr_1.1.4                              
## [23] Rsamtools_2.20.0                         
## [24] plyranges_1.24.0                         
## [25] reshape2_1.4.4                           
## [26] ggplot2_3.5.2                            
## [27] Biostrings_2.72.1                        
## [28] XVector_0.44.0                           
## [29] magrittr_2.0.3                           
## [30] rtracklayer_1.64.0                       
## [31] GenomicRanges_1.56.2                     
## [32] GenomeInfoDb_1.40.1                      
## [33] IRanges_2.38.1                           
## [34] S4Vectors_0.42.1                         
## [35] BiocGenerics_0.50.0                      
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3                bitops_1.0-9             gridExtra_2.3           
##  [4] writexl_1.5.4            rlang_1.1.4              compiler_4.4.1          
##  [7] RSQLite_2.3.9            systemfonts_1.2.3        png_0.1-8               
## [10] vctrs_0.6.5              stringr_1.5.1            pkgconfig_2.0.3         
## [13] crayon_1.5.3             fastmap_1.2.0            backports_1.5.0         
## [16] labeling_0.4.3           utf8_1.2.4               rmarkdown_2.29          
## [19] tzdb_0.4.0               ggbeeswarm_0.7.2         UCSC.utils_1.0.0        
## [22] ragg_1.3.3               purrr_1.0.2              bit_4.6.0               
## [25] xfun_0.51                zlibbioc_1.50.0          cachem_1.1.0            
## [28] jsonlite_1.8.9           blob_1.2.4               DelayedArray_0.30.1     
## [31] BiocParallel_1.38.0      broom_1.0.7              R6_2.5.1                
## [34] bslib_0.8.0              stringi_1.8.4            car_3.1-3               
## [37] jquerylib_0.1.4          Rcpp_1.0.14              knitr_1.50              
## [40] Matrix_1.7-0             tidyselect_1.2.1         rstudioapi_0.17.1       
## [43] abind_1.4-8              yaml_2.3.10              codetools_0.2-20        
## [46] curl_6.2.1               tibble_3.2.1             plyr_1.8.9              
## [49] withr_3.0.2              KEGGREST_1.44.1          ggrastr_1.0.2           
## [52] evaluate_1.0.3           ggpubr_0.6.0             pillar_1.10.2           
## [55] carData_3.0-5            generics_0.1.3           vroom_1.6.5             
## [58] RCurl_1.98-1.16          hms_1.1.3                munsell_0.5.1           
## [61] scales_1.3.0             glue_1.8.0               tools_4.4.1             
## [64] ggsignif_0.6.4           locfit_1.5-9.10          GenomicAlignments_1.40.0
## [67] XML_3.99-0.18            Cairo_1.6-2              grid_4.4.1              
## [70] tidyr_1.3.1              colorspace_2.1-1         GenomeInfoDbData_1.2.12 
## [73] beeswarm_0.4.0           vipor_0.4.7              restfulr_0.0.15         
## [76] Formula_1.2-5            cli_3.6.3                textshaping_1.0.0       
## [79] S4Arrays_1.4.1           gtable_0.3.6             rstatix_0.7.2           
## [82] sass_0.4.9               SparseArray_1.4.8        rjson_0.2.23            
## [85] farver_2.1.2             memoise_2.0.1            htmltools_0.5.8.1       
## [88] lifecycle_1.0.4          httr_1.4.7               bit64_4.5.2